AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Lazar, Lijo" <lijo.lazar@amd.com>
To: Ellen Pan <yunru.pan@amd.com>, amd-gfx@lists.freedesktop.org
Cc: Alexander.Deucher@amd.com, Christian.Koenig@amd.com,
	Jeffrey.Chan@amd.com
Subject: Re: [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init
Date: Thu, 16 Oct 2025 18:36:59 +0530	[thread overview]
Message-ID: <00e829e7-9826-49a6-93db-630abfbb6ed5@amd.com> (raw)
In-Reply-To: <20251015214848.11580-2-yunru.pan@amd.com>



On 10/16/2025 3:18 AM, Ellen Pan wrote:
>      1. Introduced amdgpu_virt_init_critical_region during VF init.
>       - VFs use init_data_header_offset and init_data_header_size_kb
>              transmitted via PF2VF mailbox to fetch the offset of
>              critical regions' offsets/sizes in VRAM and save to
>              adev->virt.crit_region_offsets and adev->virt.crit_region_sizes_kb.
> 
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 165 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h    |  11 ++
>   drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  31 ++++
>   4 files changed, 211 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a99185ed0642..3ffb9bb1ec0b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2782,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
>   		r = amdgpu_virt_request_full_gpu(adev, true);
>   		if (r)
>   			return r;
> +
> +		r = amdgpu_virt_init_critical_region(adev);
> +		if (r)
> +			return r;
>   	}
>   
>   	switch (adev->asic_type) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 56573fb27f63..805ecc69a8b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -44,6 +44,18 @@
>   		vf2pf_info->ucode_info[ucode].version = ver; \
>   	} while (0)
>   
> +#define mmRCC_CONFIG_MEMSIZE    0xde3
> +
> +const char *amdgpu_virt_dynamic_crit_table_name[] = {
> +	"IP DISCOVERY",
> +	"VBIOS IMG",
> +	"RAS TELEMETRY",
> +	"DATA EXCHANGE",
> +	"BAD PAGE INFO",
> +	"INIT HEADER",
> +	"LAST",
> +};
> +
>   bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev)
>   {
>   	/* By now all MMIO pages except mailbox are blocked */
> @@ -843,6 +855,159 @@ static void amdgpu_virt_init_ras(struct amdgpu_device *adev)
>   	adev->virt.ras.cper_rptr = 0;
>   }
>   
> +static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end)
> +{
> +	uint32_t sum = 0;
> +
> +	if (buf_start >= buf_end)
> +		return 0;
> +
> +	for (; buf_start < buf_end; buf_start++)
> +		sum += buf_start[0];
> +
> +	return 0xffffffff - sum;
> +}
> +
> +int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
> +{
> +	struct amd_sriov_msg_init_data_header *init_data_hdr = NULL;
> +	uint32_t init_hdr_offset = adev->virt.init_data_header.offset;
> +	uint32_t init_hdr_size = adev->virt.init_data_header.size_kb << 10;
> +	uint64_t vram_size;
> +	int r = 0;
> +	uint8_t checksum = 0;
> +
> +	/* Skip below init if critical region version != v2 */
> +	if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2)
> +		return 0;
> +
> +	if (init_hdr_offset < 0) {
> +		dev_err(adev->dev, "Invalid init header offset\n");
> +		return -EINVAL;
> +	}
> +
> +	vram_size = RREG32(mmRCC_CONFIG_MEMSIZE);
> +	if (!vram_size || vram_size == U32_MAX)
> +		return -EINVAL;
> +	vram_size <<= 20;
> +
> +	if ((init_hdr_offset + init_hdr_size) > vram_size) {
> +		dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n");
> +		return -EINVAL;
> +	}
> +
> +	/* Allocate for init_data_hdr */
> +	init_data_hdr = kzalloc(sizeof(struct amd_sriov_msg_init_data_header), GFP_KERNEL);
> +	if (!init_data_hdr)
> +		return -ENOMEM;
> +
> +	amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr,
> +					sizeof(struct amd_sriov_msg_init_data_header), false);
> +
> +	/* Table validation */
> +	if (strncmp(init_data_hdr->signature,
> +				AMDGPU_SRIOV_CRIT_DATA_SIGNATURE,
> +				AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) {
> +		dev_err(adev->dev, "Invalid init data signature: %.4s\n",
> +			init_data_hdr->signature);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	checksum = amdgpu_virt_crit_region_calc_checksum(
> +			(uint8_t *)&init_data_hdr->initdata_offset,
> +			(uint8_t *)init_data_hdr +
> +			sizeof(struct amd_sriov_msg_init_data_header));
> +	if (checksum != init_data_hdr->checksum) {
> +		dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n",
> +				checksum, init_data_hdr->checksum);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn));
> +	memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl));
> +
> +	adev->virt.crit_regn.offset = init_data_hdr->initdata_offset;
> +	adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb;
> +
> +	/* Validation and initialization for each table entry */
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) {
> +		if (init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) {
> +			dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
> +					init_data_hdr->ip_discovery_size_in_kb);
> +			r = -EINVAL;
> +			goto out;
> +		}
> +
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset =
> +			init_data_hdr->ip_discovery_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb =
> +			init_data_hdr->ip_discovery_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset =
> +			init_data_hdr->vbios_img_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb =
> +			init_data_hdr->vbios_img_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset =
> +			init_data_hdr->ras_tele_info_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb =
> +			init_data_hdr->ras_tele_info_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset =
> +			init_data_hdr->dataexchange_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb =
> +			init_data_hdr->dataexchange_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset =
> +			init_data_hdr->bad_page_info_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb =
> +			init_data_hdr->bad_page_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;

Could you confirm if this is really an error codndtion? For ex: I could 
see this scheme followed on SOCs which don't support RAS and thus ras 
telemetry/badpage table sections may not make sense. Same could be 
applicable for others (though most others don't look optional), but you 
may confirm which ones are mandatory vs optional.

Thanks,
Lijo

> +	}
> +
> +	adev->virt.is_dynamic_crit_regn_enabled = true;
> +
> +out:
> +	kfree(init_data_hdr);
> +	init_data_hdr = NULL;
> +
> +	return r;
> +}
> +
>   void amdgpu_virt_init(struct amdgpu_device *adev)
>   {
>   	bool is_sriov = false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 36247a160aa6..8d03a8620de9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -54,6 +54,12 @@
>   
>   #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
>   
> +/* Signature used to validate the SR-IOV dynamic critical region init data header ("INDA") */
> +#define AMDGPU_SRIOV_CRIT_DATA_SIGNATURE "INDA"
> +#define AMDGPU_SRIOV_CRIT_DATA_SIG_LEN   4
> +
> +#define IS_SRIOV_CRIT_REGN_ENTRY_VALID(hdr, id) ((hdr)->valid_tables & (1 << (id)))
> +
>   enum amdgpu_sriov_vf_mode {
>   	SRIOV_VF_MODE_BARE_METAL = 0,
>   	SRIOV_VF_MODE_ONE_VF,
> @@ -296,6 +302,9 @@ struct amdgpu_virt {
>   
>   	/* dynamic(v2) critical regions */
>   	struct amdgpu_virt_region init_data_header;
> +	struct amdgpu_virt_region crit_regn;
> +	struct amdgpu_virt_region crit_regn_tbl[AMD_SRIOV_MSG_MAX_TABLE_ID];
> +	bool is_dynamic_crit_regn_enabled;
>   
>   	/* vf2pf message */
>   	struct delayed_work vf2pf_work;
> @@ -432,6 +441,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
>   void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
>   void amdgpu_virt_init(struct amdgpu_device *adev);
>   
> +int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
> +
>   bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
>   int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
>   void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> index 9228fd2c6dfd..1cee083fb6bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> @@ -71,6 +71,37 @@ enum amd_sriov_crit_region_version {
>   	GPU_CRIT_REGION_V2 = 2,
>   };
>   
> +/* v2 layout offset enum (in order of allocation) */
> +enum amd_sriov_msg_table_id_enum {
> +	AMD_SRIOV_MSG_IPD_TABLE_ID = 0,
> +	AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID,
> +	AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID,
> +	AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID,
> +	AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID,
> +	AMD_SRIOV_MSG_INITD_H_TABLE_ID,
> +	AMD_SRIOV_MSG_MAX_TABLE_ID,
> +};
> +
> +struct amd_sriov_msg_init_data_header {
> +	char     signature[4];  /* "INDA"  */
> +	uint32_t version;
> +	uint32_t checksum;
> +	uint32_t initdata_offset; /* 0 */
> +	uint32_t initdata_size_in_kb; /* 5MB */
> +	uint32_t valid_tables;
> +	uint32_t vbios_img_offset;
> +	uint32_t vbios_img_size_in_kb;
> +	uint32_t dataexchange_offset;
> +	uint32_t dataexchange_size_in_kb;
> +	uint32_t ras_tele_info_offset;
> +	uint32_t ras_tele_info_size_in_kb;
> +	uint32_t ip_discovery_offset;
> +	uint32_t ip_discovery_size_in_kb;
> +	uint32_t bad_page_info_offset;
> +	uint32_t bad_page_size_in_kb;
> +	uint32_t reserved[8];
> +};
> +
>   /*
>    * PF2VF history log:
>    * v1 defined in amdgim


  reply	other threads:[~2025-10-16 13:07 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
2025-10-16 13:06   ` Lazar, Lijo [this message]
2025-10-16 13:08   ` Alex Deucher
2025-10-15 21:48 ` [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV Ellen Pan
2025-10-16 12:59   ` Alex Deucher
2025-10-15 21:48 ` [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets Ellen Pan
2025-10-16 13:06   ` Alex Deucher
2025-10-16 14:06     ` Lazar, Lijo
2025-10-15 21:48 ` [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region " Ellen Pan
2025-10-16 13:08   ` Alex Deucher
2025-10-16 12:56 ` [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Alex Deucher

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=00e829e7-9826-49a6-93db-630abfbb6ed5@amd.com \
    --to=lijo.lazar@amd.com \
    --cc=Alexander.Deucher@amd.com \
    --cc=Christian.Koenig@amd.com \
    --cc=Jeffrey.Chan@amd.com \
    --cc=amd-gfx@lists.freedesktop.org \
    --cc=yunru.pan@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox