* [PATCH v7 1/4] drm/amdgpu: update xgmi info and vram_base_offset on resume
2025-05-22 3:43 [PATCH v7 0/4] enable xgmi node migration support for hibernate on SRIOV Samuel Zhang
@ 2025-05-22 3:43 ` Samuel Zhang
2025-05-22 7:07 ` Lazar, Lijo
2025-05-22 3:43 ` [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP Samuel Zhang
` (2 subsequent siblings)
3 siblings, 1 reply; 13+ messages in thread
From: Samuel Zhang @ 2025-05-22 3:43 UTC (permalink / raw)
To: amd-gfx
Cc: victor.zhao, haijun.chang, guoqing.zhang, Christian.Koenig,
Alexander.Deucher, Owen.Zhang2, Qing.Ma, Lijo.Lazar, Emily.Deng,
Jiang Liu, Christian König
For SRIOV VM env with XGMI enabled systems, XGMI physical node id may
change when hibernate and resume with different VF.
Update XGMI info and vram_base_offset on resume for gfx444 SRIOV env.
Add amdgpu_virt_xgmi_migrate_enabled() as the feature flag.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 +++++
2 files changed, 39 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d477a901af84..e5bb46effb6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2732,6 +2732,12 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
+ adev->virt.is_xgmi_node_migrate_enabled = false;
+ if (amdgpu_sriov_vf(adev)) {
+ adev->virt.is_xgmi_node_migrate_enabled =
+ amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
+ }
+
total = true;
for (i = 0; i < adev->num_ip_blocks; i++) {
ip_block = &adev->ip_blocks[i];
@@ -5040,6 +5046,28 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
return 0;
}
+static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
+{
+ int r;
+ unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
+
+ if (!amdgpu_virt_xgmi_migrate_enabled(adev))
+ return 0;
+
+ r = adev->gfxhub.funcs->get_xgmi_info(adev);
+ if (r)
+ return r;
+
+ dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
+ prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
+
+ adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
+ adev->vm_manager.vram_base_offset +=
+ adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
+
+ return 0;
+}
+
/**
* amdgpu_device_resume - initiate device resume
*
@@ -5061,6 +5089,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
return r;
}
+ r = amdgpu_virt_resume(adev);
+ if (r)
+ goto exit;
+
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index df03dba67ab8..532b92628171 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -295,6 +295,9 @@ struct amdgpu_virt {
union amd_sriov_ras_caps ras_telemetry_en_caps;
struct amdgpu_virt_ras ras;
struct amd_sriov_ras_telemetry_error_count count_cache;
+
+ /* hibernate and resume with different VF feature for xgmi enabled system */
+ bool is_xgmi_node_migrate_enabled;
};
struct amdgpu_video_codec_info;
@@ -376,6 +379,10 @@ static inline bool is_virtual_machine(void)
((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
#define amdgpu_sriov_is_mes_info_enable(adev) \
((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
+
+#define amdgpu_virt_xgmi_migrate_enabled(adev) \
+ ((adev)->virt.is_xgmi_node_migrate_enabled)
+
bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
void amdgpu_virt_init_setting(struct amdgpu_device *adev);
int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH v7 1/4] drm/amdgpu: update xgmi info and vram_base_offset on resume
2025-05-22 3:43 ` [PATCH v7 1/4] drm/amdgpu: update xgmi info and vram_base_offset on resume Samuel Zhang
@ 2025-05-22 7:07 ` Lazar, Lijo
2025-05-22 7:51 ` Lazar, Lijo
0 siblings, 1 reply; 13+ messages in thread
From: Lazar, Lijo @ 2025-05-22 7:07 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Christian.Koenig, Alexander.Deucher,
Owen.Zhang2, Qing.Ma, Emily.Deng, Jiang Liu
On 5/22/2025 9:13 AM, Samuel Zhang wrote:
> For SRIOV VM env with XGMI enabled systems, XGMI physical node id may
> change when hibernate and resume with different VF.
>
> Update XGMI info and vram_base_offset on resume for gfx444 SRIOV env.
> Add amdgpu_virt_xgmi_migrate_enabled() as the feature flag.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> Reviewed-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 +++++
> 2 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d477a901af84..e5bb46effb6c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2732,6 +2732,12 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
> if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
> adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
>
> + adev->virt.is_xgmi_node_migrate_enabled = false;
> + if (amdgpu_sriov_vf(adev)) {
> + adev->virt.is_xgmi_node_migrate_enabled =
> + amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
> + }
> +
> total = true;
> for (i = 0; i < adev->num_ip_blocks; i++) {
> ip_block = &adev->ip_blocks[i];
> @@ -5040,6 +5046,28 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
> return 0;
> }
>
> +static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> +{
> + int r;
> + unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
> +
> + if (!amdgpu_virt_xgmi_migrate_enabled(adev))
> + return 0;
> +
> + r = adev->gfxhub.funcs->get_xgmi_info(adev);
> + if (r)
> + return r;
> +
> + dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
> + prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
> +
> + adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
> + adev->vm_manager.vram_base_offset +=
> + adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
> +
> + return 0;
> +}
> +
> /**
> * amdgpu_device_resume - initiate device resume
> *
> @@ -5061,6 +5089,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
> return r;
> }
>
> + r = amdgpu_virt_resume(adev);
You may restrict this to VF only for now -
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_virt_resume(adev);
...
}
Thanks,
Lijo
> + if (r)
> + goto exit;
> +
> if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
> return 0;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index df03dba67ab8..532b92628171 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,6 +295,9 @@ struct amdgpu_virt {
> union amd_sriov_ras_caps ras_telemetry_en_caps;
> struct amdgpu_virt_ras ras;
> struct amd_sriov_ras_telemetry_error_count count_cache;
> +
> + /* hibernate and resume with different VF feature for xgmi enabled system */
> + bool is_xgmi_node_migrate_enabled;
> };
>
> struct amdgpu_video_codec_info;
> @@ -376,6 +379,10 @@ static inline bool is_virtual_machine(void)
> ((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
> #define amdgpu_sriov_is_mes_info_enable(adev) \
> ((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
> +
> +#define amdgpu_virt_xgmi_migrate_enabled(adev) \
> + ((adev)->virt.is_xgmi_node_migrate_enabled)
> +
> bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
> void amdgpu_virt_init_setting(struct amdgpu_device *adev);
> int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH v7 1/4] drm/amdgpu: update xgmi info and vram_base_offset on resume
2025-05-22 7:07 ` Lazar, Lijo
@ 2025-05-22 7:51 ` Lazar, Lijo
0 siblings, 0 replies; 13+ messages in thread
From: Lazar, Lijo @ 2025-05-22 7:51 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Christian.Koenig, Alexander.Deucher,
Owen.Zhang2, Qing.Ma, Emily.Deng, Jiang Liu
On 5/22/2025 12:37 PM, Lazar, Lijo wrote:
>
>
> On 5/22/2025 9:13 AM, Samuel Zhang wrote:
>> For SRIOV VM env with XGMI enabled systems, XGMI physical node id may
>> change when hibernate and resume with different VF.
>>
>> Update XGMI info and vram_base_offset on resume for gfx444 SRIOV env.
>> Add amdgpu_virt_xgmi_migrate_enabled() as the feature flag.
>>
>> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
>> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
>> Reviewed-by: Christian König <christian.koenig@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 +++++
>> 2 files changed, 39 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index d477a901af84..e5bb46effb6c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2732,6 +2732,12 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
>> if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
>> adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
>>
>> + adev->virt.is_xgmi_node_migrate_enabled = false;
>> + if (amdgpu_sriov_vf(adev)) {
>> + adev->virt.is_xgmi_node_migrate_enabled =
>> + amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4);
>> + }
>> +
>> total = true;
>> for (i = 0; i < adev->num_ip_blocks; i++) {
>> ip_block = &adev->ip_blocks[i];
>> @@ -5040,6 +5046,28 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>> return 0;
>> }
>>
>> +static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
>> +{
>> + int r;
>> + unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
>> +
>> + if (!amdgpu_virt_xgmi_migrate_enabled(adev))
>> + return 0;
>> +
>> + r = adev->gfxhub.funcs->get_xgmi_info(adev);
>> + if (r)
>> + return r;
>> +
>> + dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
>> + prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
>> +
>> + adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
>> + adev->vm_manager.vram_base_offset +=
>> + adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
>> +
>> + return 0;
>> +}
>> +
>> /**
>> * amdgpu_device_resume - initiate device resume
>> *
>> @@ -5061,6 +5089,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
>> return r;
>> }
>>
>> + r = amdgpu_virt_resume(adev);
>
> You may restrict this to VF only for now -
> if (amdgpu_sriov_vf(adev)) {
> r = amdgpu_virt_resume(adev);
> ...
> }
>
> Thanks,
> Lijo
>
>> + if (r)
>> + goto exit;
>> +
>> if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>> return 0;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index df03dba67ab8..532b92628171 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -295,6 +295,9 @@ struct amdgpu_virt {
>> union amd_sriov_ras_caps ras_telemetry_en_caps;
>> struct amdgpu_virt_ras ras;
>> struct amd_sriov_ras_telemetry_error_count count_cache;
>> +
>> + /* hibernate and resume with different VF feature for xgmi enabled system */
>> + bool is_xgmi_node_migrate_enabled;
>> };
>>
>> struct amdgpu_video_codec_info;
>> @@ -376,6 +379,10 @@ static inline bool is_virtual_machine(void)
>> ((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
>> #define amdgpu_sriov_is_mes_info_enable(adev) \
>> ((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
>> +
>> +#define amdgpu_virt_xgmi_migrate_enabled(adev) \
>> + ((adev)->virt.is_xgmi_node_migrate_enabled)
One additional comment - better to add another check like
(adev->gmc.xgmi.node_segment_size !=0) for keeping this only for xgmi.
Thanks,
Lijo
>> +
>> bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
>> void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>> int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP
2025-05-22 3:43 [PATCH v7 0/4] enable xgmi node migration support for hibernate on SRIOV Samuel Zhang
2025-05-22 3:43 ` [PATCH v7 1/4] drm/amdgpu: update xgmi info and vram_base_offset on resume Samuel Zhang
@ 2025-05-22 3:43 ` Samuel Zhang
2025-05-22 7:44 ` Lazar, Lijo
2025-05-22 9:09 ` Christian König
2025-05-22 3:43 ` [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV Samuel Zhang
2025-05-22 3:43 ` [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error Samuel Zhang
3 siblings, 2 replies; 13+ messages in thread
From: Samuel Zhang @ 2025-05-22 3:43 UTC (permalink / raw)
To: amd-gfx
Cc: victor.zhao, haijun.chang, guoqing.zhang, Christian.Koenig,
Alexander.Deucher, Owen.Zhang2, Qing.Ma, Lijo.Lazar, Emily.Deng,
Jiang Liu, Christian König
add amdgpu_bo_fb_aper_addr() and update the cached GPU addresses to use
the FB aperture address for SMU and PSP.
2 reasons for this change:
1. when pdb0 is enabled, gpu addr from amdgpu_bo_create_kernel() is GART
aperture address, it is not compatible with SMU and PSP, it need to be
updated to use FB aperture address.
2. Since FB aperture address will change after switching to new GPU
index after hibernation, it need to be updated on resume.
Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 20 +++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 23 ++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 3 +++
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 18 +++++++++++++++++
5 files changed, 65 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 4e794d546b61..3dde57cd5b81 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1513,6 +1513,26 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
return amdgpu_bo_gpu_offset_no_check(bo);
}
+/**
+ * amdgpu_bo_fb_aper_addr - return FB aperture GPU offset of the VRAM bo
+ * @bo: amdgpu VRAM buffer object for which we query the offset
+ *
+ * Returns:
+ * current FB aperture GPU offset of the object.
+ */
+u64 amdgpu_bo_fb_aper_addr(struct amdgpu_bo *bo)
+{
+ struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+ uint64_t offset, fb_base;
+
+ WARN_ON_ONCE(bo->tbo.resource->mem_type != TTM_PL_VRAM);
+
+ fb_base = adev->mmhub.funcs->get_fb_location(adev);
+ fb_base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
+ offset = (bo->tbo.resource->start << PAGE_SHIFT) + fb_base;
+ return amdgpu_gmc_sign_extend(offset);
+}
+
/**
* amdgpu_bo_gpu_offset_no_check - return GPU offset of bo
* @bo: amdgpu object for which we query the offset
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index dcce362bfad3..c8a63e38f5d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -320,6 +320,7 @@ int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct dma_resv *resv,
bool intr);
int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr);
u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
+u64 amdgpu_bo_fb_aper_addr(struct amdgpu_bo *bo);
u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo);
uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e1e658a97b36..3fc4b8e6fc59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -871,6 +871,8 @@ static int psp_tmr_init(struct psp_context *psp)
&psp->tmr_bo, &psp->tmr_mc_addr,
pptr);
}
+ if (amdgpu_virt_xgmi_migrate_enabled(psp->adev) && psp->tmr_bo)
+ psp->tmr_mc_addr = amdgpu_bo_fb_aper_addr(psp->tmr_bo);
return ret;
}
@@ -1270,6 +1272,11 @@ int psp_ta_load(struct psp_context *psp, struct ta_context *context)
psp_copy_fw(psp, context->bin_desc.start_addr,
context->bin_desc.size_bytes);
+ if (amdgpu_virt_xgmi_migrate_enabled(psp->adev) &&
+ context->mem_context.shared_bo)
+ context->mem_context.shared_mc_addr =
+ amdgpu_bo_fb_aper_addr(context->mem_context.shared_bo);
+
psp_prep_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr, context);
ret = psp_cmd_submit_buf(psp, NULL, cmd,
@@ -2336,11 +2343,27 @@ bool amdgpu_psp_tos_reload_needed(struct amdgpu_device *adev)
return false;
}
+static void psp_update_gpu_addresses(struct amdgpu_device *adev)
+{
+ struct psp_context *psp = &adev->psp;
+
+ if (psp->cmd_buf_bo && psp->cmd_buf_mem) {
+ psp->fw_pri_mc_addr = amdgpu_bo_fb_aper_addr(psp->fw_pri_bo);
+ psp->fence_buf_mc_addr = amdgpu_bo_fb_aper_addr(psp->fence_buf_bo);
+ psp->cmd_buf_mc_addr = amdgpu_bo_fb_aper_addr(psp->cmd_buf_bo);
+ }
+ if (adev->firmware.rbuf && psp->km_ring.ring_mem)
+ psp->km_ring.ring_mem_mc_addr = amdgpu_bo_fb_aper_addr(adev->firmware.rbuf);
+}
+
static int psp_hw_start(struct psp_context *psp)
{
struct amdgpu_device *adev = psp->adev;
int ret;
+ if (amdgpu_virt_xgmi_migrate_enabled(adev))
+ psp_update_gpu_addresses(adev);
+
if (!amdgpu_sriov_vf(adev)) {
if ((is_psp_fw_valid(psp->kdb)) &&
(psp->funcs->bootloader_load_kdb != NULL)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index 3d9e9fdc10b4..bf9013f8d12e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -1152,6 +1152,9 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
adev->firmware.max_ucodes = AMDGPU_UCODE_ID_MAXIMUM;
}
+ if (amdgpu_virt_xgmi_migrate_enabled(adev) && adev->firmware.fw_buf)
+ adev->firmware.fw_buf_mc = amdgpu_bo_fb_aper_addr(adev->firmware.fw_buf);
+
for (i = 0; i < adev->firmware.max_ucodes; i++) {
ucode = &adev->firmware.ucode[i];
if (ucode->fw) {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 315b0856bf02..f9f49f37dfcd 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1000,6 +1000,21 @@ static int smu_fini_fb_allocations(struct smu_context *smu)
return 0;
}
+static void smu_update_gpu_addresses(struct smu_context *smu)
+{
+ struct smu_table_context *smu_table = &smu->smu_table;
+ struct smu_table *pm_status_table = smu_table->tables + SMU_TABLE_PMSTATUSLOG;
+ struct smu_table *driver_table = &(smu_table->driver_table);
+ struct smu_table *dummy_read_1_table = &smu_table->dummy_read_1_table;
+
+ if (pm_status_table->bo)
+ pm_status_table->mc_address = amdgpu_bo_fb_aper_addr(pm_status_table->bo);
+ if (driver_table->bo)
+ driver_table->mc_address = amdgpu_bo_fb_aper_addr(driver_table->bo);
+ if (dummy_read_1_table->bo)
+ dummy_read_1_table->mc_address = amdgpu_bo_fb_aper_addr(dummy_read_1_table->bo);
+}
+
/**
* smu_alloc_memory_pool - allocate memory pool in the system memory
*
@@ -1789,6 +1804,9 @@ static int smu_start_smc_engine(struct smu_context *smu)
struct amdgpu_device *adev = smu->adev;
int ret = 0;
+ if (amdgpu_virt_xgmi_migrate_enabled(adev))
+ smu_update_gpu_addresses(smu);
+
smu->smc_fw_state = SMU_FW_INIT;
if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP
2025-05-22 3:43 ` [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP Samuel Zhang
@ 2025-05-22 7:44 ` Lazar, Lijo
2025-05-22 9:09 ` Christian König
1 sibling, 0 replies; 13+ messages in thread
From: Lazar, Lijo @ 2025-05-22 7:44 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Christian.Koenig, Alexander.Deucher,
Owen.Zhang2, Qing.Ma, Emily.Deng, Jiang Liu
On 5/22/2025 9:13 AM, Samuel Zhang wrote:
> add amdgpu_bo_fb_aper_addr() and update the cached GPU addresses to use
> the FB aperture address for SMU and PSP.
>
> 2 reasons for this change:
> 1. when pdb0 is enabled, gpu addr from amdgpu_bo_create_kernel() is GART
> aperture address, it is not compatible with SMU and PSP, it need to be
> updated to use FB aperture address.
> 2. Since FB aperture address will change after switching to new GPU
> index after hibernation, it need to be updated on resume.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Lijo Lazar <lijo.lazar@amd.com>
Thanks,
Lijo
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 20 +++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 23 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 3 +++
> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 18 +++++++++++++++++
> 5 files changed, 65 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 4e794d546b61..3dde57cd5b81 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1513,6 +1513,26 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
> return amdgpu_bo_gpu_offset_no_check(bo);
> }
>
> +/**
> + * amdgpu_bo_fb_aper_addr - return FB aperture GPU offset of the VRAM bo
> + * @bo: amdgpu VRAM buffer object for which we query the offset
> + *
> + * Returns:
> + * current FB aperture GPU offset of the object.
> + */
> +u64 amdgpu_bo_fb_aper_addr(struct amdgpu_bo *bo)
> +{
> + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> + uint64_t offset, fb_base;
> +
> + WARN_ON_ONCE(bo->tbo.resource->mem_type != TTM_PL_VRAM);
> +
> + fb_base = adev->mmhub.funcs->get_fb_location(adev);
> + fb_base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
> + offset = (bo->tbo.resource->start << PAGE_SHIFT) + fb_base;
> + return amdgpu_gmc_sign_extend(offset);
> +}
> +
> /**
> * amdgpu_bo_gpu_offset_no_check - return GPU offset of bo
> * @bo: amdgpu object for which we query the offset
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index dcce362bfad3..c8a63e38f5d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -320,6 +320,7 @@ int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct dma_resv *resv,
> bool intr);
> int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr);
> u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
> +u64 amdgpu_bo_fb_aper_addr(struct amdgpu_bo *bo);
> u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
> uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo);
> uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index e1e658a97b36..3fc4b8e6fc59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -871,6 +871,8 @@ static int psp_tmr_init(struct psp_context *psp)
> &psp->tmr_bo, &psp->tmr_mc_addr,
> pptr);
> }
> + if (amdgpu_virt_xgmi_migrate_enabled(psp->adev) && psp->tmr_bo)
> + psp->tmr_mc_addr = amdgpu_bo_fb_aper_addr(psp->tmr_bo);
>
> return ret;
> }
> @@ -1270,6 +1272,11 @@ int psp_ta_load(struct psp_context *psp, struct ta_context *context)
> psp_copy_fw(psp, context->bin_desc.start_addr,
> context->bin_desc.size_bytes);
>
> + if (amdgpu_virt_xgmi_migrate_enabled(psp->adev) &&
> + context->mem_context.shared_bo)
> + context->mem_context.shared_mc_addr =
> + amdgpu_bo_fb_aper_addr(context->mem_context.shared_bo);
> +
> psp_prep_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr, context);
>
> ret = psp_cmd_submit_buf(psp, NULL, cmd,
> @@ -2336,11 +2343,27 @@ bool amdgpu_psp_tos_reload_needed(struct amdgpu_device *adev)
> return false;
> }
>
> +static void psp_update_gpu_addresses(struct amdgpu_device *adev)
> +{
> + struct psp_context *psp = &adev->psp;
> +
> + if (psp->cmd_buf_bo && psp->cmd_buf_mem) {
> + psp->fw_pri_mc_addr = amdgpu_bo_fb_aper_addr(psp->fw_pri_bo);
> + psp->fence_buf_mc_addr = amdgpu_bo_fb_aper_addr(psp->fence_buf_bo);
> + psp->cmd_buf_mc_addr = amdgpu_bo_fb_aper_addr(psp->cmd_buf_bo);
> + }
> + if (adev->firmware.rbuf && psp->km_ring.ring_mem)
> + psp->km_ring.ring_mem_mc_addr = amdgpu_bo_fb_aper_addr(adev->firmware.rbuf);
> +}
> +
> static int psp_hw_start(struct psp_context *psp)
> {
> struct amdgpu_device *adev = psp->adev;
> int ret;
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev))
> + psp_update_gpu_addresses(adev);
> +
> if (!amdgpu_sriov_vf(adev)) {
> if ((is_psp_fw_valid(psp->kdb)) &&
> (psp->funcs->bootloader_load_kdb != NULL)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index 3d9e9fdc10b4..bf9013f8d12e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -1152,6 +1152,9 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
> adev->firmware.max_ucodes = AMDGPU_UCODE_ID_MAXIMUM;
> }
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev) && adev->firmware.fw_buf)
> + adev->firmware.fw_buf_mc = amdgpu_bo_fb_aper_addr(adev->firmware.fw_buf);
> +
> for (i = 0; i < adev->firmware.max_ucodes; i++) {
> ucode = &adev->firmware.ucode[i];
> if (ucode->fw) {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 315b0856bf02..f9f49f37dfcd 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -1000,6 +1000,21 @@ static int smu_fini_fb_allocations(struct smu_context *smu)
> return 0;
> }
>
> +static void smu_update_gpu_addresses(struct smu_context *smu)
> +{
> + struct smu_table_context *smu_table = &smu->smu_table;
> + struct smu_table *pm_status_table = smu_table->tables + SMU_TABLE_PMSTATUSLOG;
> + struct smu_table *driver_table = &(smu_table->driver_table);
> + struct smu_table *dummy_read_1_table = &smu_table->dummy_read_1_table;
> +
> + if (pm_status_table->bo)
> + pm_status_table->mc_address = amdgpu_bo_fb_aper_addr(pm_status_table->bo);
> + if (driver_table->bo)
> + driver_table->mc_address = amdgpu_bo_fb_aper_addr(driver_table->bo);
> + if (dummy_read_1_table->bo)
> + dummy_read_1_table->mc_address = amdgpu_bo_fb_aper_addr(dummy_read_1_table->bo);
> +}
> +
> /**
> * smu_alloc_memory_pool - allocate memory pool in the system memory
> *
> @@ -1789,6 +1804,9 @@ static int smu_start_smc_engine(struct smu_context *smu)
> struct amdgpu_device *adev = smu->adev;
> int ret = 0;
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev))
> + smu_update_gpu_addresses(smu);
> +
> smu->smc_fw_state = SMU_FW_INIT;
>
> if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP
2025-05-22 3:43 ` [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP Samuel Zhang
2025-05-22 7:44 ` Lazar, Lijo
@ 2025-05-22 9:09 ` Christian König
1 sibling, 0 replies; 13+ messages in thread
From: Christian König @ 2025-05-22 9:09 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Alexander.Deucher, Owen.Zhang2,
Qing.Ma, Lijo.Lazar, Emily.Deng, Jiang Liu
On 5/22/25 05:43, Samuel Zhang wrote:
> add amdgpu_bo_fb_aper_addr() and update the cached GPU addresses to use
> the FB aperture address for SMU and PSP.
>
> 2 reasons for this change:
> 1. when pdb0 is enabled, gpu addr from amdgpu_bo_create_kernel() is GART
> aperture address, it is not compatible with SMU and PSP, it need to be
> updated to use FB aperture address.
> 2. Since FB aperture address will change after switching to new GPU
> index after hibernation, it need to be updated on resume.
>
> Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> Acked-by: Christian König <christian.koenig@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 20 +++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 23 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 3 +++
> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 18 +++++++++++++++++
> 5 files changed, 65 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 4e794d546b61..3dde57cd5b81 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -1513,6 +1513,26 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo)
> return amdgpu_bo_gpu_offset_no_check(bo);
> }
>
> +/**
> + * amdgpu_bo_fb_aper_addr - return FB aperture GPU offset of the VRAM bo
> + * @bo: amdgpu VRAM buffer object for which we query the offset
> + *
> + * Returns:
> + * current FB aperture GPU offset of the object.
> + */
> +u64 amdgpu_bo_fb_aper_addr(struct amdgpu_bo *bo)
> +{
> + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> + uint64_t offset, fb_base;
> +
> + WARN_ON_ONCE(bo->tbo.resource->mem_type != TTM_PL_VRAM);
> +
> + fb_base = adev->mmhub.funcs->get_fb_location(adev);
As discussed in the other mail thread this should probably be changed to use the cached fb_start.
Regards,
Christian.
> + fb_base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
> + offset = (bo->tbo.resource->start << PAGE_SHIFT) + fb_base;
> + return amdgpu_gmc_sign_extend(offset);
> +}
> +
> /**
> * amdgpu_bo_gpu_offset_no_check - return GPU offset of bo
> * @bo: amdgpu object for which we query the offset
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index dcce362bfad3..c8a63e38f5d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -320,6 +320,7 @@ int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct dma_resv *resv,
> bool intr);
> int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr);
> u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
> +u64 amdgpu_bo_fb_aper_addr(struct amdgpu_bo *bo);
> u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
> uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo);
> uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index e1e658a97b36..3fc4b8e6fc59 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -871,6 +871,8 @@ static int psp_tmr_init(struct psp_context *psp)
> &psp->tmr_bo, &psp->tmr_mc_addr,
> pptr);
> }
> + if (amdgpu_virt_xgmi_migrate_enabled(psp->adev) && psp->tmr_bo)
> + psp->tmr_mc_addr = amdgpu_bo_fb_aper_addr(psp->tmr_bo);
>
> return ret;
> }
> @@ -1270,6 +1272,11 @@ int psp_ta_load(struct psp_context *psp, struct ta_context *context)
> psp_copy_fw(psp, context->bin_desc.start_addr,
> context->bin_desc.size_bytes);
>
> + if (amdgpu_virt_xgmi_migrate_enabled(psp->adev) &&
> + context->mem_context.shared_bo)
> + context->mem_context.shared_mc_addr =
> + amdgpu_bo_fb_aper_addr(context->mem_context.shared_bo);
> +
> psp_prep_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr, context);
>
> ret = psp_cmd_submit_buf(psp, NULL, cmd,
> @@ -2336,11 +2343,27 @@ bool amdgpu_psp_tos_reload_needed(struct amdgpu_device *adev)
> return false;
> }
>
> +static void psp_update_gpu_addresses(struct amdgpu_device *adev)
> +{
> + struct psp_context *psp = &adev->psp;
> +
> + if (psp->cmd_buf_bo && psp->cmd_buf_mem) {
> + psp->fw_pri_mc_addr = amdgpu_bo_fb_aper_addr(psp->fw_pri_bo);
> + psp->fence_buf_mc_addr = amdgpu_bo_fb_aper_addr(psp->fence_buf_bo);
> + psp->cmd_buf_mc_addr = amdgpu_bo_fb_aper_addr(psp->cmd_buf_bo);
> + }
> + if (adev->firmware.rbuf && psp->km_ring.ring_mem)
> + psp->km_ring.ring_mem_mc_addr = amdgpu_bo_fb_aper_addr(adev->firmware.rbuf);
> +}
> +
> static int psp_hw_start(struct psp_context *psp)
> {
> struct amdgpu_device *adev = psp->adev;
> int ret;
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev))
> + psp_update_gpu_addresses(adev);
> +
> if (!amdgpu_sriov_vf(adev)) {
> if ((is_psp_fw_valid(psp->kdb)) &&
> (psp->funcs->bootloader_load_kdb != NULL)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index 3d9e9fdc10b4..bf9013f8d12e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -1152,6 +1152,9 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
> adev->firmware.max_ucodes = AMDGPU_UCODE_ID_MAXIMUM;
> }
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev) && adev->firmware.fw_buf)
> + adev->firmware.fw_buf_mc = amdgpu_bo_fb_aper_addr(adev->firmware.fw_buf);
> +
> for (i = 0; i < adev->firmware.max_ucodes; i++) {
> ucode = &adev->firmware.ucode[i];
> if (ucode->fw) {
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 315b0856bf02..f9f49f37dfcd 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -1000,6 +1000,21 @@ static int smu_fini_fb_allocations(struct smu_context *smu)
> return 0;
> }
>
> +static void smu_update_gpu_addresses(struct smu_context *smu)
> +{
> + struct smu_table_context *smu_table = &smu->smu_table;
> + struct smu_table *pm_status_table = smu_table->tables + SMU_TABLE_PMSTATUSLOG;
> + struct smu_table *driver_table = &(smu_table->driver_table);
> + struct smu_table *dummy_read_1_table = &smu_table->dummy_read_1_table;
> +
> + if (pm_status_table->bo)
> + pm_status_table->mc_address = amdgpu_bo_fb_aper_addr(pm_status_table->bo);
> + if (driver_table->bo)
> + driver_table->mc_address = amdgpu_bo_fb_aper_addr(driver_table->bo);
> + if (dummy_read_1_table->bo)
> + dummy_read_1_table->mc_address = amdgpu_bo_fb_aper_addr(dummy_read_1_table->bo);
> +}
> +
> /**
> * smu_alloc_memory_pool - allocate memory pool in the system memory
> *
> @@ -1789,6 +1804,9 @@ static int smu_start_smc_engine(struct smu_context *smu)
> struct amdgpu_device *adev = smu->adev;
> int ret = 0;
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev))
> + smu_update_gpu_addresses(smu);
> +
> smu->smc_fw_state = SMU_FW_INIT;
>
> if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV
2025-05-22 3:43 [PATCH v7 0/4] enable xgmi node migration support for hibernate on SRIOV Samuel Zhang
2025-05-22 3:43 ` [PATCH v7 1/4] drm/amdgpu: update xgmi info and vram_base_offset on resume Samuel Zhang
2025-05-22 3:43 ` [PATCH v7 2/4] drm/amdgpu: update GPU addresses for SMU and PSP Samuel Zhang
@ 2025-05-22 3:43 ` Samuel Zhang
2025-05-22 7:46 ` Lazar, Lijo
2025-05-22 9:25 ` Christian König
2025-05-22 3:43 ` [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error Samuel Zhang
3 siblings, 2 replies; 13+ messages in thread
From: Samuel Zhang @ 2025-05-22 3:43 UTC (permalink / raw)
To: amd-gfx
Cc: victor.zhao, haijun.chang, guoqing.zhang, Christian.Koenig,
Alexander.Deucher, Owen.Zhang2, Qing.Ma, Lijo.Lazar, Emily.Deng
When switching to new GPU index after hibernation and then resume,
VRAM offset of each VRAM BO will be changed, and the cached gpu
addresses needed to updated.
This is to enable pdb0 and switch to use pdb0-based virtual gpu
address by default in amdgpu_bo_create_reserved(). since the virtual
addresses do not change, this can avoid the need to update all
cached gpu addresses all over the codebase.
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 23 ++++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 2 +-
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 13 ++++++++----
4 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e5bb46effb6c..97389168c90f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5065,6 +5065,10 @@ static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
adev->vm_manager.vram_base_offset +=
adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
+ /* fb_start/fb_end may be reset in get_fb_location(), set it again on resume */
+ adev->gmc.fb_start = 0;
+ adev->gmc.fb_end = adev->gmc.xgmi.node_segment_size * adev->gmc.xgmi.num_physical_nodes - 1;
+
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index d1fa5e8e3937..35df4be6ef2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -38,6 +38,8 @@
#include <drm/drm_drv.h>
#include <drm/ttm/ttm_tt.h>
+static const u64 four_gb = 0x100000000ULL;
+
/**
* amdgpu_gmc_pdb0_alloc - allocate vram for pdb0
*
@@ -251,7 +253,17 @@ void amdgpu_gmc_sysvm_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc
u64 hive_vram_end = mc->xgmi.node_segment_size * mc->xgmi.num_physical_nodes - 1;
mc->vram_start = mc->xgmi.node_segment_size * mc->xgmi.physical_node_id;
mc->vram_end = mc->vram_start + mc->xgmi.node_segment_size - 1;
- mc->gart_start = hive_vram_end + 1;
+ if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
+ /* set mc->vram_start to 0 to switch the returned GPU address of
+ * amdgpu_bo_create_reserved() from FB aperture to GART aperture.
+ */
+ mc->vram_start = 0;
+ mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
+ if (mc->real_vram_size < mc->visible_vram_size)
+ mc->visible_vram_size = mc->real_vram_size;
+ }
+ /* node_segment_size may not 4GB aligned on SRIOV, align up is needed. */
+ mc->gart_start = ALIGN(hive_vram_end + 1, four_gb);
mc->gart_end = mc->gart_start + mc->gart_size - 1;
mc->fb_start = hive_vram_start;
mc->fb_end = hive_vram_end;
@@ -276,7 +288,6 @@ void amdgpu_gmc_sysvm_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc
void amdgpu_gmc_gart_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc,
enum amdgpu_gart_placement gart_placement)
{
- const uint64_t four_gb = 0x100000000ULL;
u64 size_af, size_bf;
/*To avoid the hole, limit the max mc address to AMDGPU_GMC_HOLE_START*/
u64 max_mc_address = min(adev->gmc.mc_mask, AMDGPU_GMC_HOLE_START - 1);
@@ -1068,6 +1079,14 @@ void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev)
flags |= AMDGPU_PTE_FRAG((adev->gmc.vmid0_page_table_block_size + 9*1));
flags |= AMDGPU_PDE_PTE_FLAG(adev);
+ if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
+ /* always start from current device so that the GART address can keep
+ * consistent when hibernate-resume with different GPUs.
+ */
+ vram_addr = adev->vm_manager.vram_base_offset;
+ vram_end = vram_addr + vram_size;
+ }
+
/* The first n PDE0 entries are used as PTE,
* pointing to vram
*/
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index cb25f7f0dfc1..e6165f6d0763 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -180,7 +180,7 @@ gfxhub_v1_2_xcc_init_system_aperture_regs(struct amdgpu_device *adev,
/* In the case squeezing vram into GART aperture, we don't use
* FB aperture and AGP aperture. Disable them.
*/
- if (adev->gmc.pdb0_bo) {
+ if (adev->gmc.pdb0_bo && !amdgpu_virt_xgmi_migrate_enabled(adev)) {
WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_FB_LOCATION_TOP, 0);
WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_FB_LOCATION_BASE, 0x00FFFFFF);
WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_AGP_TOP, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 59385da80185..23d02965ad65 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -413,6 +413,11 @@ static const uint32_t ecc_umc_mcumc_ctrl_mask_addrs[] = {
(0x001d43e0 + 0x00001800),
};
+static inline bool gmc_v9_0_is_pdb0_enabled(struct amdgpu_device *adev)
+{
+ return adev->gmc.xgmi.connected_to_cpu || amdgpu_virt_xgmi_migrate_enabled(adev);
+}
+
static inline bool gmc_v9_0_is_multi_chiplet(struct amdgpu_device *adev)
{
return !!adev->aid_mask;
@@ -1726,7 +1731,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
/* add the xgmi offset of the physical node */
base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
- if (adev->gmc.xgmi.connected_to_cpu) {
+ if (gmc_v9_0_is_pdb0_enabled(adev)) {
amdgpu_gmc_sysvm_location(adev, mc);
} else {
amdgpu_gmc_vram_location(adev, mc, base);
@@ -1841,7 +1846,7 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
return 0;
}
- if (adev->gmc.xgmi.connected_to_cpu) {
+ if (gmc_v9_0_is_pdb0_enabled(adev)) {
adev->gmc.vmid0_page_table_depth = 1;
adev->gmc.vmid0_page_table_block_size = 12;
} else {
@@ -1867,7 +1872,7 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
if (r)
return r;
- if (adev->gmc.xgmi.connected_to_cpu)
+ if (gmc_v9_0_is_pdb0_enabled(adev))
r = amdgpu_gmc_pdb0_alloc(adev);
}
@@ -2372,7 +2377,7 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device *adev)
{
int r;
- if (adev->gmc.xgmi.connected_to_cpu)
+ if (gmc_v9_0_is_pdb0_enabled(adev))
amdgpu_gmc_init_pdb0(adev);
if (adev->gart.bo == NULL) {
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV
2025-05-22 3:43 ` [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV Samuel Zhang
@ 2025-05-22 7:46 ` Lazar, Lijo
2025-05-22 9:25 ` Christian König
1 sibling, 0 replies; 13+ messages in thread
From: Lazar, Lijo @ 2025-05-22 7:46 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Christian.Koenig, Alexander.Deucher,
Owen.Zhang2, Qing.Ma, Emily.Deng
On 5/22/2025 9:13 AM, Samuel Zhang wrote:
> When switching to new GPU index after hibernation and then resume,
> VRAM offset of each VRAM BO will be changed, and the cached gpu
> addresses needed to updated.
>
> This is to enable pdb0 and switch to use pdb0-based virtual gpu
> address by default in amdgpu_bo_create_reserved(). since the virtual
> addresses do not change, this can avoid the need to update all
> cached gpu addresses all over the codebase.
>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 23 ++++++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 13 ++++++++----
> 4 files changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e5bb46effb6c..97389168c90f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5065,6 +5065,10 @@ static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> adev->vm_manager.vram_base_offset +=
> adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
>
> + /* fb_start/fb_end may be reset in get_fb_location(), set it again on resume */
> + adev->gmc.fb_start = 0;
> + adev->gmc.fb_end = adev->gmc.xgmi.node_segment_size * adev->gmc.xgmi.num_physical_nodes - 1;
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index d1fa5e8e3937..35df4be6ef2a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -38,6 +38,8 @@
> #include <drm/drm_drv.h>
> #include <drm/ttm/ttm_tt.h>
>
> +static const u64 four_gb = 0x100000000ULL;
> +
> /**
> * amdgpu_gmc_pdb0_alloc - allocate vram for pdb0
> *
> @@ -251,7 +253,17 @@ void amdgpu_gmc_sysvm_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc
> u64 hive_vram_end = mc->xgmi.node_segment_size * mc->xgmi.num_physical_nodes - 1;
> mc->vram_start = mc->xgmi.node_segment_size * mc->xgmi.physical_node_id;
> mc->vram_end = mc->vram_start + mc->xgmi.node_segment_size - 1;
> - mc->gart_start = hive_vram_end + 1;
> + if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
> + /* set mc->vram_start to 0 to switch the returned GPU address of
> + * amdgpu_bo_create_reserved() from FB aperture to GART aperture.
> + */
> + mc->vram_start = 0;
> + mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
> + if (mc->real_vram_size < mc->visible_vram_size)
> + mc->visible_vram_size = mc->real_vram_size;
> + }
> + /* node_segment_size may not 4GB aligned on SRIOV, align up is needed. */
> + mc->gart_start = ALIGN(hive_vram_end + 1, four_gb);
> mc->gart_end = mc->gart_start + mc->gart_size - 1;
> mc->fb_start = hive_vram_start;
> mc->fb_end = hive_vram_end;
> @@ -276,7 +288,6 @@ void amdgpu_gmc_sysvm_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc
> void amdgpu_gmc_gart_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc,
> enum amdgpu_gart_placement gart_placement)
> {
> - const uint64_t four_gb = 0x100000000ULL;
> u64 size_af, size_bf;
> /*To avoid the hole, limit the max mc address to AMDGPU_GMC_HOLE_START*/
> u64 max_mc_address = min(adev->gmc.mc_mask, AMDGPU_GMC_HOLE_START - 1);
> @@ -1068,6 +1079,14 @@ void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev)
> flags |= AMDGPU_PTE_FRAG((adev->gmc.vmid0_page_table_block_size + 9*1));
> flags |= AMDGPU_PDE_PTE_FLAG(adev);
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
> + /* always start from current device so that the GART address can keep
> + * consistent when hibernate-resume with different GPUs.
> + */
> + vram_addr = adev->vm_manager.vram_base_offset;
> + vram_end = vram_addr + vram_size;
> + }
> +
> /* The first n PDE0 entries are used as PTE,
> * pointing to vram
> */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
> index cb25f7f0dfc1..e6165f6d0763 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
> @@ -180,7 +180,7 @@ gfxhub_v1_2_xcc_init_system_aperture_regs(struct amdgpu_device *adev,
> /* In the case squeezing vram into GART aperture, we don't use
> * FB aperture and AGP aperture. Disable them.
> */
> - if (adev->gmc.pdb0_bo) {
> + if (adev->gmc.pdb0_bo && !amdgpu_virt_xgmi_migrate_enabled(adev)) {
> WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_FB_LOCATION_TOP, 0);
> WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_FB_LOCATION_BASE, 0x00FFFFFF);
> WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_AGP_TOP, 0);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 59385da80185..23d02965ad65 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -413,6 +413,11 @@ static const uint32_t ecc_umc_mcumc_ctrl_mask_addrs[] = {
> (0x001d43e0 + 0x00001800),
> };
>
> +static inline bool gmc_v9_0_is_pdb0_enabled(struct amdgpu_device *adev)
This may be kept generic -> amdgpu_gmc_is_pdb0_enabled(adev)
Thanks,
Lijo
> +{
> + return adev->gmc.xgmi.connected_to_cpu || amdgpu_virt_xgmi_migrate_enabled(adev);
> +}
> +
> static inline bool gmc_v9_0_is_multi_chiplet(struct amdgpu_device *adev)
> {
> return !!adev->aid_mask;
> @@ -1726,7 +1731,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
>
> /* add the xgmi offset of the physical node */
> base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
> - if (adev->gmc.xgmi.connected_to_cpu) {
> + if (gmc_v9_0_is_pdb0_enabled(adev)) {
> amdgpu_gmc_sysvm_location(adev, mc);
> } else {
> amdgpu_gmc_vram_location(adev, mc, base);
> @@ -1841,7 +1846,7 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
> return 0;
> }
>
> - if (adev->gmc.xgmi.connected_to_cpu) {
> + if (gmc_v9_0_is_pdb0_enabled(adev)) {
> adev->gmc.vmid0_page_table_depth = 1;
> adev->gmc.vmid0_page_table_block_size = 12;
> } else {
> @@ -1867,7 +1872,7 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
> if (r)
> return r;
>
> - if (adev->gmc.xgmi.connected_to_cpu)
> + if (gmc_v9_0_is_pdb0_enabled(adev))
> r = amdgpu_gmc_pdb0_alloc(adev);
> }
>
> @@ -2372,7 +2377,7 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device *adev)
> {
> int r;
>
> - if (adev->gmc.xgmi.connected_to_cpu)
> + if (gmc_v9_0_is_pdb0_enabled(adev))
> amdgpu_gmc_init_pdb0(adev);
>
> if (adev->gart.bo == NULL) {
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV
2025-05-22 3:43 ` [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV Samuel Zhang
2025-05-22 7:46 ` Lazar, Lijo
@ 2025-05-22 9:25 ` Christian König
1 sibling, 0 replies; 13+ messages in thread
From: Christian König @ 2025-05-22 9:25 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Alexander.Deucher, Owen.Zhang2,
Qing.Ma, Lijo.Lazar, Emily.Deng
On 5/22/25 05:43, Samuel Zhang wrote:
> When switching to new GPU index after hibernation and then resume,
> VRAM offset of each VRAM BO will be changed, and the cached gpu
> addresses needed to updated.
>
> This is to enable pdb0 and switch to use pdb0-based virtual gpu
> address by default in amdgpu_bo_create_reserved(). since the virtual
> addresses do not change, this can avoid the need to update all
> cached gpu addresses all over the codebase.
>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 23 ++++++++++++++++++++--
> drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 13 ++++++++----
> 4 files changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e5bb46effb6c..97389168c90f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5065,6 +5065,10 @@ static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> adev->vm_manager.vram_base_offset +=
> adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
>
> + /* fb_start/fb_end may be reset in get_fb_location(), set it again on resume */
> + adev->gmc.fb_start = 0;
> + adev->gmc.fb_end = adev->gmc.xgmi.node_segment_size * adev->gmc.xgmi.num_physical_nodes - 1;
Why do we need that in the first place? Updating the fb_start/end in get_fb_location() is actually fine.
Or are the fb_start/fb_end values still used anywhere they shouldn't?
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index d1fa5e8e3937..35df4be6ef2a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -38,6 +38,8 @@
> #include <drm/drm_drv.h>
> #include <drm/ttm/ttm_tt.h>
>
> +static const u64 four_gb = 0x100000000ULL;
> +
> /**
> * amdgpu_gmc_pdb0_alloc - allocate vram for pdb0
> *
> @@ -251,7 +253,17 @@ void amdgpu_gmc_sysvm_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc
> u64 hive_vram_end = mc->xgmi.node_segment_size * mc->xgmi.num_physical_nodes - 1;
> mc->vram_start = mc->xgmi.node_segment_size * mc->xgmi.physical_node_id;
> mc->vram_end = mc->vram_start + mc->xgmi.node_segment_size - 1;
> - mc->gart_start = hive_vram_end + 1;
> + if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
> + /* set mc->vram_start to 0 to switch the returned GPU address of
> + * amdgpu_bo_create_reserved() from FB aperture to GART aperture.
> + */
> + mc->vram_start = 0;
> + mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
> + if (mc->real_vram_size < mc->visible_vram_size)
> + mc->visible_vram_size = mc->real_vram_size;
Make that here mc->visible_vram_size = min(mc->visible_vram_size, mc->real_vram_size).
> + }
> + /* node_segment_size may not 4GB aligned on SRIOV, align up is needed. */
> + mc->gart_start = ALIGN(hive_vram_end + 1, four_gb);
> mc->gart_end = mc->gart_start + mc->gart_size - 1;
> mc->fb_start = hive_vram_start;
> mc->fb_end = hive_vram_end;
> @@ -276,7 +288,6 @@ void amdgpu_gmc_sysvm_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc
> void amdgpu_gmc_gart_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc,
> enum amdgpu_gart_placement gart_placement)
> {
> - const uint64_t four_gb = 0x100000000ULL;
> u64 size_af, size_bf;
> /*To avoid the hole, limit the max mc address to AMDGPU_GMC_HOLE_START*/
> u64 max_mc_address = min(adev->gmc.mc_mask, AMDGPU_GMC_HOLE_START - 1);
> @@ -1068,6 +1079,14 @@ void amdgpu_gmc_init_pdb0(struct amdgpu_device *adev)
> flags |= AMDGPU_PTE_FRAG((adev->gmc.vmid0_page_table_block_size + 9*1));
> flags |= AMDGPU_PDE_PTE_FLAG(adev);
>
> + if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
> + /* always start from current device so that the GART address can keep
> + * consistent when hibernate-resume with different GPUs.
> + */
> + vram_addr = adev->vm_manager.vram_base_offset;
> + vram_end = vram_addr + vram_size;
> + }
> +
Please make an else branch here for the XGMI connected GPU case, or maybe code it like this:
/*
* ....
*/
if (!amdgpu_virt_xgmi_migrate_enabled(adev))
vram_addr -= adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size
vram_end = vram_start + vram_size;
> + vram_addr = adev->vm_manager.vram_base_offset;
> + vram_end = vram_addr + vram_size;
> + }
>
> /* The first n PDE0 entries are used as PTE,
> * pointing to vram
> */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
> index cb25f7f0dfc1..e6165f6d0763 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
> @@ -180,7 +180,7 @@ gfxhub_v1_2_xcc_init_system_aperture_regs(struct amdgpu_device *adev,
> /* In the case squeezing vram into GART aperture, we don't use
> * FB aperture and AGP aperture. Disable them.
> */
> - if (adev->gmc.pdb0_bo) {
> + if (adev->gmc.pdb0_bo && !amdgpu_virt_xgmi_migrate_enabled(adev)) {
Better check for adev->gmc.xgmi.connected_to_cpu here.
Regards,
Christian.
> WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_FB_LOCATION_TOP, 0);
> WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_FB_LOCATION_BASE, 0x00FFFFFF);
> WREG32_SOC15(GC, GET_INST(GC, i), regMC_VM_AGP_TOP, 0);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 59385da80185..23d02965ad65 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -413,6 +413,11 @@ static const uint32_t ecc_umc_mcumc_ctrl_mask_addrs[] = {
> (0x001d43e0 + 0x00001800),
> };
>
> +static inline bool gmc_v9_0_is_pdb0_enabled(struct amdgpu_device *adev)
> +{
> + return adev->gmc.xgmi.connected_to_cpu || amdgpu_virt_xgmi_migrate_enabled(adev);
> +}
> +
> static inline bool gmc_v9_0_is_multi_chiplet(struct amdgpu_device *adev)
> {
> return !!adev->aid_mask;
> @@ -1726,7 +1731,7 @@ static void gmc_v9_0_vram_gtt_location(struct amdgpu_device *adev,
>
> /* add the xgmi offset of the physical node */
> base += adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;
> - if (adev->gmc.xgmi.connected_to_cpu) {
> + if (gmc_v9_0_is_pdb0_enabled(adev)) {
> amdgpu_gmc_sysvm_location(adev, mc);
> } else {
> amdgpu_gmc_vram_location(adev, mc, base);
> @@ -1841,7 +1846,7 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
> return 0;
> }
>
> - if (adev->gmc.xgmi.connected_to_cpu) {
> + if (gmc_v9_0_is_pdb0_enabled(adev)) {
> adev->gmc.vmid0_page_table_depth = 1;
> adev->gmc.vmid0_page_table_block_size = 12;
> } else {
> @@ -1867,7 +1872,7 @@ static int gmc_v9_0_gart_init(struct amdgpu_device *adev)
> if (r)
> return r;
>
> - if (adev->gmc.xgmi.connected_to_cpu)
> + if (gmc_v9_0_is_pdb0_enabled(adev))
> r = amdgpu_gmc_pdb0_alloc(adev);
> }
>
> @@ -2372,7 +2377,7 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device *adev)
> {
> int r;
>
> - if (adev->gmc.xgmi.connected_to_cpu)
> + if (gmc_v9_0_is_pdb0_enabled(adev))
> amdgpu_gmc_init_pdb0(adev);
>
> if (adev->gart.bo == NULL) {
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error
2025-05-22 3:43 [PATCH v7 0/4] enable xgmi node migration support for hibernate on SRIOV Samuel Zhang
` (2 preceding siblings ...)
2025-05-22 3:43 ` [PATCH v7 3/4] drm/amdgpu: enable pdb0 for hibernation on SRIOV Samuel Zhang
@ 2025-05-22 3:43 ` Samuel Zhang
2025-05-22 7:48 ` Lazar, Lijo
2025-05-22 9:27 ` Christian König
3 siblings, 2 replies; 13+ messages in thread
From: Samuel Zhang @ 2025-05-22 3:43 UTC (permalink / raw)
To: amd-gfx
Cc: victor.zhao, haijun.chang, guoqing.zhang, Christian.Koenig,
Alexander.Deucher, Owen.Zhang2, Qing.Ma, Lijo.Lazar, Emily.Deng
IH is not working after switching a new gpu index for the first time.
The msix table in virtual machine is faked. The real msix table will be
programmed by QEMU when guest enable/disable msix interrupt. But QEMU
accessing VF msix table (register GFXMSIX_VECT0_ADDR_LO) is blocked
by nBIF protection if the VF isn't in exclusive access at that time.
call amdgpu_restore_msix on resume to restore msix table.
Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h | 1 +
3 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 97389168c90f..1f38ff1e42d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5051,6 +5051,9 @@ static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
int r;
unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
+ if (amdgpu_sriov_vf(adev))
+ amdgpu_restore_msix(adev);
+
if (!amdgpu_virt_xgmi_migrate_enabled(adev))
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 0e890f2785b1..f080354efec8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -245,7 +245,7 @@ static bool amdgpu_msi_ok(struct amdgpu_device *adev)
return true;
}
-static void amdgpu_restore_msix(struct amdgpu_device *adev)
+void amdgpu_restore_msix(struct amdgpu_device *adev)
{
u16 ctrl;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
index aef5c216b191..f52bd7e6d988 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
@@ -149,5 +149,6 @@ void amdgpu_irq_gpu_reset_resume_helper(struct amdgpu_device *adev);
int amdgpu_irq_add_domain(struct amdgpu_device *adev);
void amdgpu_irq_remove_domain(struct amdgpu_device *adev);
unsigned amdgpu_irq_create_mapping(struct amdgpu_device *adev, unsigned src_id);
+void amdgpu_restore_msix(struct amdgpu_device *adev);
#endif
--
2.43.5
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error
2025-05-22 3:43 ` [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error Samuel Zhang
@ 2025-05-22 7:48 ` Lazar, Lijo
2025-05-22 9:27 ` Christian König
1 sibling, 0 replies; 13+ messages in thread
From: Lazar, Lijo @ 2025-05-22 7:48 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Christian.Koenig, Alexander.Deucher,
Owen.Zhang2, Qing.Ma, Emily.Deng
On 5/22/2025 9:13 AM, Samuel Zhang wrote:
> IH is not working after switching a new gpu index for the first time.
>
> The msix table in virtual machine is faked. The real msix table will be
> programmed by QEMU when guest enable/disable msix interrupt. But QEMU
> accessing VF msix table (register GFXMSIX_VECT0_ADDR_LO) is blocked
> by nBIF protection if the VF isn't in exclusive access at that time.
>
> call amdgpu_restore_msix on resume to restore msix table.
>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h | 1 +
> 3 files changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 97389168c90f..1f38ff1e42d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5051,6 +5051,9 @@ static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> int r;
> unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
>
> + if (amdgpu_sriov_vf(adev))
This check won't be required if amdgpu_virt_resume() is restricted only
for VFs.
Thanks,
Lijo
> + amdgpu_restore_msix(adev);
> +
> if (!amdgpu_virt_xgmi_migrate_enabled(adev))
> return 0;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 0e890f2785b1..f080354efec8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -245,7 +245,7 @@ static bool amdgpu_msi_ok(struct amdgpu_device *adev)
> return true;
> }
>
> -static void amdgpu_restore_msix(struct amdgpu_device *adev)
> +void amdgpu_restore_msix(struct amdgpu_device *adev)
> {
> u16 ctrl;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
> index aef5c216b191..f52bd7e6d988 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
> @@ -149,5 +149,6 @@ void amdgpu_irq_gpu_reset_resume_helper(struct amdgpu_device *adev);
> int amdgpu_irq_add_domain(struct amdgpu_device *adev);
> void amdgpu_irq_remove_domain(struct amdgpu_device *adev);
> unsigned amdgpu_irq_create_mapping(struct amdgpu_device *adev, unsigned src_id);
> +void amdgpu_restore_msix(struct amdgpu_device *adev);
>
> #endif
^ permalink raw reply [flat|nested] 13+ messages in thread* Re: [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error
2025-05-22 3:43 ` [PATCH v7 4/4] drm/amdgpu: fix fence fallback timer expired error Samuel Zhang
2025-05-22 7:48 ` Lazar, Lijo
@ 2025-05-22 9:27 ` Christian König
1 sibling, 0 replies; 13+ messages in thread
From: Christian König @ 2025-05-22 9:27 UTC (permalink / raw)
To: Samuel Zhang, amd-gfx
Cc: victor.zhao, haijun.chang, Alexander.Deucher, Owen.Zhang2,
Qing.Ma, Lijo.Lazar, Emily.Deng
On 5/22/25 05:43, Samuel Zhang wrote:
> IH is not working after switching a new gpu index for the first time.
>
> The msix table in virtual machine is faked. The real msix table will be
> programmed by QEMU when guest enable/disable msix interrupt. But QEMU
> accessing VF msix table (register GFXMSIX_VECT0_ADDR_LO) is blocked
> by nBIF protection if the VF isn't in exclusive access at that time.
That explanation need to be a code comment above the call, apart from that looks reasonable to me but I'm not an expert for that stuff.
Alex and/or Lijo should probably take a look as well.
Regards,
Christian.
>
> call amdgpu_restore_msix on resume to restore msix table.
>
> Signed-off-by: Samuel Zhang <guoqing.zhang@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h | 1 +
> 3 files changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 97389168c90f..1f38ff1e42d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5051,6 +5051,9 @@ static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> int r;
> unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
>
> + if (amdgpu_sriov_vf(adev))
> + amdgpu_restore_msix(adev);
> +
> if (!amdgpu_virt_xgmi_migrate_enabled(adev))
> return 0;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 0e890f2785b1..f080354efec8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -245,7 +245,7 @@ static bool amdgpu_msi_ok(struct amdgpu_device *adev)
> return true;
> }
>
> -static void amdgpu_restore_msix(struct amdgpu_device *adev)
> +void amdgpu_restore_msix(struct amdgpu_device *adev)
> {
> u16 ctrl;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
> index aef5c216b191..f52bd7e6d988 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.h
> @@ -149,5 +149,6 @@ void amdgpu_irq_gpu_reset_resume_helper(struct amdgpu_device *adev);
> int amdgpu_irq_add_domain(struct amdgpu_device *adev);
> void amdgpu_irq_remove_domain(struct amdgpu_device *adev);
> unsigned amdgpu_irq_create_mapping(struct amdgpu_device *adev, unsigned src_id);
> +void amdgpu_restore_msix(struct amdgpu_device *adev);
>
> #endif
^ permalink raw reply [flat|nested] 13+ messages in thread