AMD-GFX Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix
@ 2025-10-15 21:48 Ellen Pan
  2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Ellen Pan @ 2025-10-15 21:48 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian.Koenig, Lijo.Lazar, Jeffrey.Chan,
	Ellen Pan

 - This change prepares for the later patches that introduce the _v2 suffix for SRIOV critical regions

Signed-off-by: Ellen Pan <yunru.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 20 ++++-----
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 50 +++++++++++++++------
 2 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f96beb96c75c..8cd02eb605c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -686,7 +686,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
 		/* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/
 		adev->virt.fw_reserve.p_pf2vf =
 			(struct amd_sriov_msg_pf2vf_info_header *)
-			(adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
+			(adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
 
 		amdgpu_virt_read_pf2vf_data(adev);
 	}
@@ -703,21 +703,21 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev)
 		if (adev->mman.fw_vram_usage_va) {
 			adev->virt.fw_reserve.p_pf2vf =
 				(struct amd_sriov_msg_pf2vf_info_header *)
-				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
+				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
 			adev->virt.fw_reserve.p_vf2pf =
 				(struct amd_sriov_msg_vf2pf_info_header *)
-				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10));
+				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
 			adev->virt.fw_reserve.ras_telemetry =
-				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10));
+				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
 		} else if (adev->mman.drv_vram_usage_va) {
 			adev->virt.fw_reserve.p_pf2vf =
 				(struct amd_sriov_msg_pf2vf_info_header *)
-				(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
+				(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
 			adev->virt.fw_reserve.p_vf2pf =
 				(struct amd_sriov_msg_vf2pf_info_header *)
-				(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10));
+				(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
 			adev->virt.fw_reserve.ras_telemetry =
-				(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10));
+				(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
 		}
 
 		amdgpu_virt_read_pf2vf_data(adev);
@@ -1304,7 +1304,7 @@ static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev,
 	checksum = host_telemetry->header.checksum;
 	used_size = host_telemetry->header.used_size;
 
-	if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
+	if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10))
 		return 0;
 
 	tmp = kmemdup(&host_telemetry->body.error_count, used_size, GFP_KERNEL);
@@ -1383,7 +1383,7 @@ amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev,
 	checksum = host_telemetry->header.checksum;
 	used_size = host_telemetry->header.used_size;
 
-	if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
+	if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10))
 		return -EINVAL;
 
 	cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL);
@@ -1515,7 +1515,7 @@ static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev,
 	checksum = host_telemetry->header.checksum;
 	used_size = host_telemetry->header.used_size;
 
-	if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
+	if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10))
 		return 0;
 
 	tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index 3a79ed7d8031..7509756b9ac5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -23,26 +23,48 @@
 #ifndef AMDGV_SRIOV_MSG__H_
 #define AMDGV_SRIOV_MSG__H_
 
-/* unit in kilobytes */
-#define AMD_SRIOV_MSG_VBIOS_OFFSET	     0
-#define AMD_SRIOV_MSG_VBIOS_SIZE_KB	     64
-#define AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB AMD_SRIOV_MSG_VBIOS_SIZE_KB
-#define AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB   4
-#define AMD_SRIOV_MSG_TMR_OFFSET_KB	     2048
-#define AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB	     2
-#define AMD_SRIOV_RAS_TELEMETRY_SIZE_KB	     64
+#define AMD_SRIOV_MSG_SIZE_KB                           1
+
 /*
- * layout
+ * layout v1
  * 0           64KB        65KB        66KB           68KB                   132KB
  * |   VBIOS   |   PF2VF   |   VF2PF   |   Bad Page   | RAS Telemetry Region | ...
  * |   64KB    |   1KB     |   1KB     |   2KB        | 64KB                 | ...
  */
 
-#define AMD_SRIOV_MSG_SIZE_KB                   1
-#define AMD_SRIOV_MSG_PF2VF_OFFSET_KB           AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB
-#define AMD_SRIOV_MSG_VF2PF_OFFSET_KB           (AMD_SRIOV_MSG_PF2VF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB)
-#define AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB        (AMD_SRIOV_MSG_VF2PF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB)
-#define AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB   (AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB + AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB)
+/*
+ * layout v2 (offsets are dynamically allocated and the offsets below are examples)
+ * 0           1KB         64KB        65KB        66KB           68KB                   132KB
+ * |  INITD_H  |   VBIOS   |   PF2VF   |   VF2PF   |   Bad Page   | RAS Telemetry Region | ...
+ * |   1KB     |   64KB    |   1KB     |   1KB     |   2KB        | 64KB                 | ...
+ *
+ * Note: PF2VF + VF2PF + Bad Page = DataExchange region (allocated contiguously)
+ */
+
+/* v1 layout sizes */
+#define AMD_SRIOV_MSG_VBIOS_SIZE_KB_V1			64
+#define AMD_SRIOV_MSG_PF2VF_SIZE_KB_V1			1
+#define AMD_SRIOV_MSG_VF2PF_SIZE_KB_V1			1
+#define AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB_V1		2
+#define AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1		64
+#define AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB_V1		\
+	(AMD_SRIOV_MSG_PF2VF_SIZE_KB_V1 + AMD_SRIOV_MSG_VF2PF_SIZE_KB_V1 + \
+	 AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB_V1)
+
+/* v1 offsets */
+#define AMD_SRIOV_MSG_VBIOS_OFFSET_V1			0
+#define AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB_V1		AMD_SRIOV_MSG_VBIOS_SIZE_KB_V1
+#define AMD_SRIOV_MSG_TMR_OFFSET_KB			2048
+#define AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1		AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB_V1
+#define AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1		\
+	(AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 + AMD_SRIOV_MSG_SIZE_KB)
+#define AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB_V1		\
+	(AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 + AMD_SRIOV_MSG_SIZE_KB)
+#define AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1	\
+	(AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB_V1 + AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB_V1)
+#define AMD_SRIOV_MSG_INIT_DATA_TOT_SIZE_KB_V1		\
+	(AMD_SRIOV_MSG_VBIOS_SIZE_KB_V1 + AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB_V1 + \
+	 AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1)
 
 /*
  * PF2VF history log:
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init
  2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
@ 2025-10-15 21:48 ` Ellen Pan
  2025-10-16 13:06   ` Lazar, Lijo
  2025-10-16 13:08   ` Alex Deucher
  2025-10-15 21:48 ` [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV Ellen Pan
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 12+ messages in thread
From: Ellen Pan @ 2025-10-15 21:48 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian.Koenig, Lijo.Lazar, Jeffrey.Chan,
	Ellen Pan

    1. Introduced amdgpu_virt_init_critical_region during VF init.
     - VFs use init_data_header_offset and init_data_header_size_kb
            transmitted via PF2VF mailbox to fetch the offset of
            critical regions' offsets/sizes in VRAM and save to
            adev->virt.crit_region_offsets and adev->virt.crit_region_sizes_kb.

Signed-off-by: Ellen Pan <yunru.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 165 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h    |  11 ++
 drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  31 ++++
 4 files changed, 211 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a99185ed0642..3ffb9bb1ec0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2782,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 		r = amdgpu_virt_request_full_gpu(adev, true);
 		if (r)
 			return r;
+
+		r = amdgpu_virt_init_critical_region(adev);
+		if (r)
+			return r;
 	}
 
 	switch (adev->asic_type) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 56573fb27f63..805ecc69a8b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -44,6 +44,18 @@
 		vf2pf_info->ucode_info[ucode].version = ver; \
 	} while (0)
 
+#define mmRCC_CONFIG_MEMSIZE    0xde3
+
+const char *amdgpu_virt_dynamic_crit_table_name[] = {
+	"IP DISCOVERY",
+	"VBIOS IMG",
+	"RAS TELEMETRY",
+	"DATA EXCHANGE",
+	"BAD PAGE INFO",
+	"INIT HEADER",
+	"LAST",
+};
+
 bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev)
 {
 	/* By now all MMIO pages except mailbox are blocked */
@@ -843,6 +855,159 @@ static void amdgpu_virt_init_ras(struct amdgpu_device *adev)
 	adev->virt.ras.cper_rptr = 0;
 }
 
+static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end)
+{
+	uint32_t sum = 0;
+
+	if (buf_start >= buf_end)
+		return 0;
+
+	for (; buf_start < buf_end; buf_start++)
+		sum += buf_start[0];
+
+	return 0xffffffff - sum;
+}
+
+int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
+{
+	struct amd_sriov_msg_init_data_header *init_data_hdr = NULL;
+	uint32_t init_hdr_offset = adev->virt.init_data_header.offset;
+	uint32_t init_hdr_size = adev->virt.init_data_header.size_kb << 10;
+	uint64_t vram_size;
+	int r = 0;
+	uint8_t checksum = 0;
+
+	/* Skip below init if critical region version != v2 */
+	if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2)
+		return 0;
+
+	if (init_hdr_offset < 0) {
+		dev_err(adev->dev, "Invalid init header offset\n");
+		return -EINVAL;
+	}
+
+	vram_size = RREG32(mmRCC_CONFIG_MEMSIZE);
+	if (!vram_size || vram_size == U32_MAX)
+		return -EINVAL;
+	vram_size <<= 20;
+
+	if ((init_hdr_offset + init_hdr_size) > vram_size) {
+		dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n");
+		return -EINVAL;
+	}
+
+	/* Allocate for init_data_hdr */
+	init_data_hdr = kzalloc(sizeof(struct amd_sriov_msg_init_data_header), GFP_KERNEL);
+	if (!init_data_hdr)
+		return -ENOMEM;
+
+	amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr,
+					sizeof(struct amd_sriov_msg_init_data_header), false);
+
+	/* Table validation */
+	if (strncmp(init_data_hdr->signature,
+				AMDGPU_SRIOV_CRIT_DATA_SIGNATURE,
+				AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) {
+		dev_err(adev->dev, "Invalid init data signature: %.4s\n",
+			init_data_hdr->signature);
+		r = -EINVAL;
+		goto out;
+	}
+
+	checksum = amdgpu_virt_crit_region_calc_checksum(
+			(uint8_t *)&init_data_hdr->initdata_offset,
+			(uint8_t *)init_data_hdr +
+			sizeof(struct amd_sriov_msg_init_data_header));
+	if (checksum != init_data_hdr->checksum) {
+		dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n",
+				checksum, init_data_hdr->checksum);
+		r = -EINVAL;
+		goto out;
+	}
+
+	memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn));
+	memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl));
+
+	adev->virt.crit_regn.offset = init_data_hdr->initdata_offset;
+	adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb;
+
+	/* Validation and initialization for each table entry */
+	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) {
+		if (init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) {
+			dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
+					init_data_hdr->ip_discovery_size_in_kb);
+			r = -EINVAL;
+			goto out;
+		}
+
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset =
+			init_data_hdr->ip_discovery_offset;
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb =
+			init_data_hdr->ip_discovery_size_in_kb;
+	} else {
+		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
+			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID]);
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) {
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset =
+			init_data_hdr->vbios_img_offset;
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb =
+			init_data_hdr->vbios_img_size_in_kb;
+	} else {
+		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
+			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID]);
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) {
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset =
+			init_data_hdr->ras_tele_info_offset;
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb =
+			init_data_hdr->ras_tele_info_size_in_kb;
+	} else {
+		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
+			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID]);
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) {
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset =
+			init_data_hdr->dataexchange_offset;
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb =
+			init_data_hdr->dataexchange_size_in_kb;
+	} else {
+		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
+			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID]);
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) {
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset =
+			init_data_hdr->bad_page_info_offset;
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb =
+			init_data_hdr->bad_page_size_in_kb;
+	} else {
+		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
+			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID]);
+		r = -EINVAL;
+		goto out;
+	}
+
+	adev->virt.is_dynamic_crit_regn_enabled = true;
+
+out:
+	kfree(init_data_hdr);
+	init_data_hdr = NULL;
+
+	return r;
+}
+
 void amdgpu_virt_init(struct amdgpu_device *adev)
 {
 	bool is_sriov = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 36247a160aa6..8d03a8620de9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -54,6 +54,12 @@
 
 #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
 
+/* Signature used to validate the SR-IOV dynamic critical region init data header ("INDA") */
+#define AMDGPU_SRIOV_CRIT_DATA_SIGNATURE "INDA"
+#define AMDGPU_SRIOV_CRIT_DATA_SIG_LEN   4
+
+#define IS_SRIOV_CRIT_REGN_ENTRY_VALID(hdr, id) ((hdr)->valid_tables & (1 << (id)))
+
 enum amdgpu_sriov_vf_mode {
 	SRIOV_VF_MODE_BARE_METAL = 0,
 	SRIOV_VF_MODE_ONE_VF,
@@ -296,6 +302,9 @@ struct amdgpu_virt {
 
 	/* dynamic(v2) critical regions */
 	struct amdgpu_virt_region init_data_header;
+	struct amdgpu_virt_region crit_regn;
+	struct amdgpu_virt_region crit_regn_tbl[AMD_SRIOV_MSG_MAX_TABLE_ID];
+	bool is_dynamic_crit_regn_enabled;
 
 	/* vf2pf message */
 	struct delayed_work vf2pf_work;
@@ -432,6 +441,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
 void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
 void amdgpu_virt_init(struct amdgpu_device *adev);
 
+int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
+
 bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
 int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
 void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index 9228fd2c6dfd..1cee083fb6bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -71,6 +71,37 @@ enum amd_sriov_crit_region_version {
 	GPU_CRIT_REGION_V2 = 2,
 };
 
+/* v2 layout offset enum (in order of allocation) */
+enum amd_sriov_msg_table_id_enum {
+	AMD_SRIOV_MSG_IPD_TABLE_ID = 0,
+	AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID,
+	AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID,
+	AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID,
+	AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID,
+	AMD_SRIOV_MSG_INITD_H_TABLE_ID,
+	AMD_SRIOV_MSG_MAX_TABLE_ID,
+};
+
+struct amd_sriov_msg_init_data_header {
+	char     signature[4];  /* "INDA"  */
+	uint32_t version;
+	uint32_t checksum;
+	uint32_t initdata_offset; /* 0 */
+	uint32_t initdata_size_in_kb; /* 5MB */
+	uint32_t valid_tables;
+	uint32_t vbios_img_offset;
+	uint32_t vbios_img_size_in_kb;
+	uint32_t dataexchange_offset;
+	uint32_t dataexchange_size_in_kb;
+	uint32_t ras_tele_info_offset;
+	uint32_t ras_tele_info_size_in_kb;
+	uint32_t ip_discovery_offset;
+	uint32_t ip_discovery_size_in_kb;
+	uint32_t bad_page_info_offset;
+	uint32_t bad_page_size_in_kb;
+	uint32_t reserved[8];
+};
+
 /*
  * PF2VF history log:
  * v1 defined in amdgim
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV
  2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
  2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
@ 2025-10-15 21:48 ` Ellen Pan
  2025-10-16 12:59   ` Alex Deucher
  2025-10-15 21:48 ` [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets Ellen Pan
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 12+ messages in thread
From: Ellen Pan @ 2025-10-15 21:48 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian.Koenig, Lijo.Lazar, Jeffrey.Chan,
	Ellen Pan

- During guest driver init, as VFs receive the PF msg to
	init the dynamic critical region (v2), VFs reuse fw_vram_usage_*
	 from ttm to store critical region tables in a 5MB chunk.

Signed-off-by: Ellen Pan <yunru.pan@amd.com>
---
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  | 29 ++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 10 +++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  9 ++++++
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index c7d32fb216e4..636385c80f64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -181,19 +181,22 @@ int amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev)
 	u8 frev, crev;
 	int usage_bytes = 0;
 
-	if (amdgpu_atom_parse_data_header(ctx, index, NULL, &frev, &crev, &data_offset)) {
-		if (frev == 2 && crev == 1) {
-			fw_usage_v2_1 =
-				(struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset);
-			amdgpu_atomfirmware_allocate_fb_v2_1(adev,
-					fw_usage_v2_1,
-					&usage_bytes);
-		} else if (frev >= 2 && crev >= 2) {
-			fw_usage_v2_2 =
-				(struct vram_usagebyfirmware_v2_2 *)(ctx->bios + data_offset);
-			amdgpu_atomfirmware_allocate_fb_v2_2(adev,
-					fw_usage_v2_2,
-					&usage_bytes);
+	/* Skip atomfirmware allocation for SRIOV VFs when dynamic crit regn is enabled */
+	if (!(amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled)) {
+		if (amdgpu_atom_parse_data_header(ctx, index, NULL, &frev, &crev, &data_offset)) {
+			if (frev == 2 && crev == 1) {
+				fw_usage_v2_1 =
+					(struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset);
+				amdgpu_atomfirmware_allocate_fb_v2_1(adev,
+						fw_usage_v2_1,
+						&usage_bytes);
+			} else if (frev >= 2 && crev >= 2) {
+				fw_usage_v2_2 =
+					(struct vram_usagebyfirmware_v2_2 *)(ctx->bios + data_offset);
+				amdgpu_atomfirmware_allocate_fb_v2_2(adev,
+						fw_usage_v2_2,
+						&usage_bytes);
+			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 7583da3d9ab0..e226c3aff7d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1939,17 +1939,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 		return r;
 
 	/*
-	 *The reserved vram for driver must be pinned to the specified
-	 *place on the VRAM, so reserve it early.
+	 * The reserved VRAM for the driver must be pinned to a specific
+	 * location in VRAM, so reserve it early.
 	 */
 	r = amdgpu_ttm_drv_reserve_vram_init(adev);
 	if (r)
 		return r;
 
 	/*
-	 * only NAVI10 and onwards ASIC support for IP discovery.
-	 * If IP discovery enabled, a block of memory should be
-	 * reserved for IP discovey.
+	 * only NAVI10 and later ASICs support IP discovery.
+	 * If IP discovery is enabled, a block of memory should be
+	 * reserved for it.
 	 */
 	if (adev->discovery.reserve_tmr) {
 		r = amdgpu_ttm_reserve_tmr(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 805ecc69a8b5..12659990abe0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -999,6 +999,15 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
 		goto out;
 	}
 
+	/* reserved memory starts from crit region base offset with the size of 5MB */
+	adev->mman.fw_vram_usage_start_offset = adev->virt.crit_regn.offset;
+	adev->mman.fw_vram_usage_size = adev->virt.crit_regn.size_kb << 10;
+	dev_info(adev->dev,
+		"critical region v%d requested to reserve memory start at %08x with %d KB.\n",
+			init_data_hdr->version,
+			adev->mman.fw_vram_usage_start_offset,
+			adev->mman.fw_vram_usage_size >> 10);
+
 	adev->virt.is_dynamic_crit_regn_enabled = true;
 
 out:
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets
  2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
  2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
  2025-10-15 21:48 ` [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV Ellen Pan
@ 2025-10-15 21:48 ` Ellen Pan
  2025-10-16 13:06   ` Alex Deucher
  2025-10-15 21:48 ` [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region " Ellen Pan
  2025-10-16 12:56 ` [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Alex Deucher
  4 siblings, 1 reply; 12+ messages in thread
From: Ellen Pan @ 2025-10-15 21:48 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian.Koenig, Lijo.Lazar, Jeffrey.Chan,
	Ellen Pan

1. Added VF logic in amdgpu_virt to init IP discovery using the offsets from dynamic(v2) critical regions;
2. Added VF logic in amdgpu_virt to init bios image using the offsets from dynamic(v2) critical regions;

Signed-off-by: Ellen Pan <yunru.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c      | 34 ++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 23 ++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 37 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  2 +
 4 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
index 00e96419fcda..070fd61f8463 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
@@ -96,11 +96,12 @@ void amdgpu_bios_release(struct amdgpu_device *adev)
  * part of the system bios.  On boot, the system bios puts a
  * copy of the igp rom at the start of vram if a discrete card is
  * present.
- * For SR-IOV, the vbios image is also put in VRAM in the VF.
+ * For SR-IOV, if dynamic critical region is not enabled,
+ * the vbios image is also put at the start of VRAM in the VF.
  */
 static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
 {
-	uint8_t __iomem *bios;
+	uint8_t __iomem *bios = NULL;
 	resource_size_t vram_base;
 	resource_size_t size = 256 * 1024; /* ??? */
 
@@ -114,18 +115,33 @@ static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
 
 	adev->bios = NULL;
 	vram_base = pci_resource_start(adev->pdev, 0);
-	bios = ioremap_wc(vram_base, size);
-	if (!bios)
-		return false;
 
 	adev->bios = kmalloc(size, GFP_KERNEL);
-	if (!adev->bios) {
-		iounmap(bios);
+	if (!adev->bios)
 		return false;
+
+	/* For SRIOV, when dynamic critical region is enabled,
+	 * the vbios image is put at a dynamic offset of VRAM in the VF.
+	 * If dynamic critical region is disabled, follow the existing logic as on baremetal.
+	 */
+	if (amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled) {
+		if (amdgpu_virt_get_dynamic_data_info(adev,
+				AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID, adev->bios, &size)) {
+			amdgpu_bios_release(adev);
+			return false;
+		}
+	} else {
+		bios = ioremap_wc(vram_base, size);
+		if (!bios) {
+			amdgpu_bios_release(adev);
+			return false;
+		}
+
+		memcpy_fromio(adev->bios, bios, size);
+		iounmap(bios);
 	}
+
 	adev->bios_size = size;
-	memcpy_fromio(adev->bios, bios, size);
-	iounmap(bios);
 
 	if (!check_atom_bios(adev, size)) {
 		amdgpu_bios_release(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 4e75334f3b3a..1809deb86797 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -303,14 +303,29 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
 	 * then it is not required to be reserved.
 	 */
 	if (sz_valid) {
-		uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
-		amdgpu_device_vram_access(adev, pos, (uint32_t *)binary,
-					  adev->discovery.size, false);
-		adev->discovery.reserve_tmr = true;
+		if (amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled) {
+			/* For SRIOV VFs with dynamic critical region enabled,
+			 * we will get the IPD binary via below call.
+			 * If dynamic critical region is disabled, fall through to the normal sequence.
+			 */
+			if (amdgpu_virt_get_dynamic_data_info(adev,
+						AMD_SRIOV_MSG_IPD_TABLE_ID, binary,
+						(uint64_t *)&adev->mman.discovery_tmr_size)) {
+				ret = -EINVAL;
+				goto exit;
+			}
+		} else {
+			uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
+
+			amdgpu_device_vram_access(adev, pos, (uint32_t *)binary,
+					adev->discovery.size, false);
+			adev->discovery.reserve_tmr = true;
+		}
 	} else {
 		ret = amdgpu_discovery_read_binary_from_sysmem(adev, binary);
 	}
 
+exit:
 	if (ret)
 		dev_err(adev->dev,
 			"failed to read discovery info from memory, vram size read: %llx",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 12659990abe0..15157ed5df29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -999,6 +999,14 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
 		goto out;
 	}
 
+	/* Validation for critical region info */
+	if (adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb > DISCOVERY_TMR_SIZE) {
+		dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
+				adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb);
+		r = -EINVAL;
+		goto out;
+	}
+
 	/* reserved memory starts from crit region base offset with the size of 5MB */
 	adev->mman.fw_vram_usage_start_offset = adev->virt.crit_regn.offset;
 	adev->mman.fw_vram_usage_size = adev->virt.crit_regn.size_kb << 10;
@@ -1017,6 +1025,35 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
 	return r;
 }
 
+int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev,
+	int data_id, uint8_t *binary, uint64_t *size)
+{
+	uint32_t data_offset = 0;
+	uint32_t data_size = 0;
+	enum amd_sriov_msg_table_id_enum data_table_id = data_id;
+
+	if (data_table_id >= AMD_SRIOV_MSG_MAX_TABLE_ID)
+		return -EINVAL;
+
+	data_offset = adev->virt.crit_regn_tbl[data_table_id].offset;
+	data_size = adev->virt.crit_regn_tbl[data_table_id].size_kb << 10;
+
+	/* Validate on input params */
+	if (!binary || !size || *size < (uint64_t)data_size)
+		return -EINVAL;
+
+	/* Proceed to copy the dynamic content */
+	amdgpu_device_vram_access(adev,
+			(uint64_t)data_offset, (uint32_t *)binary, data_size, false);
+	*size = (uint64_t)data_size; // update the size as out param.
+
+	dev_dbg(adev->dev,
+		"Got %s info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n",
+		amdgpu_virt_dynamic_crit_table_name[data_id], data_offset, data_size);
+
+	return 0;
+}
+
 void amdgpu_virt_init(struct amdgpu_device *adev)
 {
 	bool is_sriov = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 8d03a8620de9..2a13cc892a13 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -442,6 +442,8 @@ void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
 void amdgpu_virt_init(struct amdgpu_device *adev);
 
 int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
+int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev,
+	int data_id, uint8_t *binary, uint64_t *size);
 
 bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
 int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region to init from dynamic crit_region offsets
  2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
                   ` (2 preceding siblings ...)
  2025-10-15 21:48 ` [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets Ellen Pan
@ 2025-10-15 21:48 ` Ellen Pan
  2025-10-16 13:08   ` Alex Deucher
  2025-10-16 12:56 ` [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Alex Deucher
  4 siblings, 1 reply; 12+ messages in thread
From: Ellen Pan @ 2025-10-15 21:48 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alexander.Deucher, Christian.Koenig, Lijo.Lazar, Jeffrey.Chan,
	Ellen Pan

1. Added VF logic to init data exchange region using the offsets from dynamic(v2) critical regions;

Signed-off-by: Ellen Pan <yunru.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 104 ++++++++++++++++++-----
 1 file changed, 85 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 15157ed5df29..fe9a806dbf34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -218,12 +218,12 @@ int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev)
 				    &adev->virt.mm_table.gpu_addr,
 				    (void *)&adev->virt.mm_table.cpu_addr);
 	if (r) {
-		DRM_ERROR("failed to alloc mm table and error = %d.\n", r);
+		dev_err(adev->dev, "failed to alloc mm table and error = %d.\n", r);
 		return r;
 	}
 
 	memset((void *)adev->virt.mm_table.cpu_addr, 0, PAGE_SIZE);
-	DRM_INFO("MM table gpu addr = 0x%llx, cpu addr = %p.\n",
+	dev_info(adev->dev, "MM table gpu addr = 0x%llx, cpu addr = %p.\n",
 		 adev->virt.mm_table.gpu_addr,
 		 adev->virt.mm_table.cpu_addr);
 	return 0;
@@ -403,7 +403,9 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
 			if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
 							AMDGPU_GPU_PAGE_SIZE,
 							&bo, NULL))
-				DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
+				dev_dbg(adev->dev,
+						"RAS WARN: reserve vram for retired page %llx fail\n",
+						bp);
 			data->bps_bo[i] = bo;
 		}
 		data->last_reserved = i + 1;
@@ -671,10 +673,34 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
 	schedule_delayed_work(&(adev->virt.vf2pf_work), adev->virt.vf2pf_update_interval_ms);
 }
 
+static int amdgpu_virt_read_exchange_data_from_mem(struct amdgpu_device *adev, uint32_t *pfvf_data)
+{
+	uint32_t dataexchange_offset =
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset;
+	uint32_t dataexchange_size =
+		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10;
+	uint64_t pos = 0;
+
+	dev_info(adev->dev,
+			"Got data exchange info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n",
+			dataexchange_offset, dataexchange_size);
+
+	if (!IS_ALIGNED(dataexchange_offset, 4) || !IS_ALIGNED(dataexchange_size, 4)) {
+		dev_err(adev->dev, "Data exchange data not aligned to 4 bytes\n");
+		return -EINVAL;
+	}
+
+	pos = (uint64_t)dataexchange_offset;
+	amdgpu_device_vram_access(adev, pos, pfvf_data,
+			dataexchange_size, false);
+
+	return 0;
+}
+
 void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev)
 {
 	if (adev->virt.vf2pf_update_interval_ms != 0) {
-		DRM_INFO("clean up the vf2pf work item\n");
+		dev_info(adev->dev, "clean up the vf2pf work item\n");
 		cancel_delayed_work_sync(&adev->virt.vf2pf_work);
 		adev->virt.vf2pf_update_interval_ms = 0;
 	}
@@ -682,13 +708,15 @@ void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev)
 
 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
 {
+	uint32_t *pfvf_data = NULL;
+
 	adev->virt.fw_reserve.p_pf2vf = NULL;
 	adev->virt.fw_reserve.p_vf2pf = NULL;
 	adev->virt.vf2pf_update_interval_ms = 0;
 	adev->virt.vf2pf_update_retry_cnt = 0;
 
 	if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
-		DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
+		dev_warn(adev->dev, "Currently fw_vram and drv_vram should not have values at the same time!");
 	} else if (adev->mman.fw_vram_usage_va || adev->mman.drv_vram_usage_va) {
 		/* go through this logic in ip_init and reset to init workqueue*/
 		amdgpu_virt_exchange_data(adev);
@@ -697,11 +725,34 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
 		schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms));
 	} else if (adev->bios != NULL) {
 		/* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/
-		adev->virt.fw_reserve.p_pf2vf =
-			(struct amd_sriov_msg_pf2vf_info_header *)
-			(adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
+		if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) {
+			pfvf_data =
+				kzalloc(adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10,
+					GFP_KERNEL);
+			if (!pfvf_data) {
+				dev_err(adev->dev, "Failed to allocate memory for pfvf_data\n");
+				return;
+			}
 
-		amdgpu_virt_read_pf2vf_data(adev);
+			if (amdgpu_virt_read_exchange_data_from_mem(adev, pfvf_data))
+				goto free_pfvf_data;
+
+			adev->virt.fw_reserve.p_pf2vf =
+				(struct amd_sriov_msg_pf2vf_info_header *)pfvf_data;
+
+			amdgpu_virt_read_pf2vf_data(adev);
+
+free_pfvf_data:
+			kfree(pfvf_data);
+			pfvf_data = NULL;
+			adev->virt.fw_reserve.p_pf2vf = NULL;
+		} else {
+			adev->virt.fw_reserve.p_pf2vf =
+				(struct amd_sriov_msg_pf2vf_info_header *)
+				(adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
+
+			amdgpu_virt_read_pf2vf_data(adev);
+		}
 	}
 }
 
@@ -714,14 +765,29 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev)
 
 	if (adev->mman.fw_vram_usage_va || adev->mman.drv_vram_usage_va) {
 		if (adev->mman.fw_vram_usage_va) {
-			adev->virt.fw_reserve.p_pf2vf =
-				(struct amd_sriov_msg_pf2vf_info_header *)
-				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
-			adev->virt.fw_reserve.p_vf2pf =
-				(struct amd_sriov_msg_vf2pf_info_header *)
-				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
-			adev->virt.fw_reserve.ras_telemetry =
-				(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
+			if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) {
+				adev->virt.fw_reserve.p_pf2vf =
+					(struct amd_sriov_msg_pf2vf_info_header *)
+					(adev->mman.fw_vram_usage_va +
+					adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset);
+				adev->virt.fw_reserve.p_vf2pf =
+					(struct amd_sriov_msg_vf2pf_info_header *)
+					(adev->mman.fw_vram_usage_va +
+					adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset +
+					(AMD_SRIOV_MSG_SIZE_KB << 10));
+				adev->virt.fw_reserve.ras_telemetry =
+					(adev->mman.fw_vram_usage_va +
+					adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset);
+			} else {
+				adev->virt.fw_reserve.p_pf2vf =
+					(struct amd_sriov_msg_pf2vf_info_header *)
+					(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
+				adev->virt.fw_reserve.p_vf2pf =
+					(struct amd_sriov_msg_vf2pf_info_header *)
+					(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
+				adev->virt.fw_reserve.ras_telemetry =
+					(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
+			}
 		} else if (adev->mman.drv_vram_usage_va) {
 			adev->virt.fw_reserve.p_pf2vf =
 				(struct amd_sriov_msg_pf2vf_info_header *)
@@ -829,7 +895,7 @@ static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg)
 			break;
 		default: /* other chip doesn't support SRIOV */
 			is_sriov = false;
-			DRM_ERROR("Unknown asic type: %d!\n", adev->asic_type);
+			dev_err(adev->dev, "Unknown asic type: %d!\n", adev->asic_type);
 			break;
 		}
 	}
@@ -1501,7 +1567,7 @@ amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block bloc
 	case AMDGPU_RAS_BLOCK__MPIO:
 		return RAS_TELEMETRY_GPU_BLOCK_MPIO;
 	default:
-		DRM_WARN_ONCE("Unsupported SRIOV RAS telemetry block 0x%x\n",
+		dev_warn(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n",
 			      block);
 		return RAS_TELEMETRY_GPU_BLOCK_COUNT;
 	}
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix
  2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
                   ` (3 preceding siblings ...)
  2025-10-15 21:48 ` [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region " Ellen Pan
@ 2025-10-16 12:56 ` Alex Deucher
  4 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2025-10-16 12:56 UTC (permalink / raw)
  To: Ellen Pan
  Cc: amd-gfx, Alexander.Deucher, Christian.Koenig, Lijo.Lazar,
	Jeffrey.Chan

On Wed, Oct 15, 2025 at 6:06 PM Ellen Pan <yunru.pan@amd.com> wrote:
>
>  - This change prepares the later patches to introduce the _v2 suffix to SRIOV critical regions
>
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 20 ++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 50 +++++++++++++++------
>  2 files changed, 46 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index f96beb96c75c..8cd02eb605c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -686,7 +686,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
>                 /* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/
>                 adev->virt.fw_reserve.p_pf2vf =
>                         (struct amd_sriov_msg_pf2vf_info_header *)
> -                       (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
> +                       (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
>
>                 amdgpu_virt_read_pf2vf_data(adev);
>         }
> @@ -703,21 +703,21 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev)
>                 if (adev->mman.fw_vram_usage_va) {
>                         adev->virt.fw_reserve.p_pf2vf =
>                                 (struct amd_sriov_msg_pf2vf_info_header *)
> -                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
> +                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
>                         adev->virt.fw_reserve.p_vf2pf =
>                                 (struct amd_sriov_msg_vf2pf_info_header *)
> -                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10));
> +                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
>                         adev->virt.fw_reserve.ras_telemetry =
> -                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10));
> +                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
>                 } else if (adev->mman.drv_vram_usage_va) {
>                         adev->virt.fw_reserve.p_pf2vf =
>                                 (struct amd_sriov_msg_pf2vf_info_header *)
> -                               (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
> +                               (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
>                         adev->virt.fw_reserve.p_vf2pf =
>                                 (struct amd_sriov_msg_vf2pf_info_header *)
> -                               (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10));
> +                               (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
>                         adev->virt.fw_reserve.ras_telemetry =
> -                               (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10));
> +                               (adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
>                 }
>
>                 amdgpu_virt_read_pf2vf_data(adev);
> @@ -1304,7 +1304,7 @@ static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev,
>         checksum = host_telemetry->header.checksum;
>         used_size = host_telemetry->header.used_size;
>
> -       if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
> +       if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10))
>                 return 0;
>
>         tmp = kmemdup(&host_telemetry->body.error_count, used_size, GFP_KERNEL);
> @@ -1383,7 +1383,7 @@ amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev,
>         checksum = host_telemetry->header.checksum;
>         used_size = host_telemetry->header.used_size;
>
> -       if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
> +       if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10))
>                 return -EINVAL;
>
>         cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL);
> @@ -1515,7 +1515,7 @@ static int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev,
>         checksum = host_telemetry->header.checksum;
>         used_size = host_telemetry->header.used_size;
>
> -       if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
> +       if (used_size > (AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1 << 10))
>                 return 0;
>
>         tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> index 3a79ed7d8031..7509756b9ac5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> @@ -23,26 +23,48 @@
>  #ifndef AMDGV_SRIOV_MSG__H_
>  #define AMDGV_SRIOV_MSG__H_
>
> -/* unit in kilobytes */
> -#define AMD_SRIOV_MSG_VBIOS_OFFSET          0
> -#define AMD_SRIOV_MSG_VBIOS_SIZE_KB         64
> -#define AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB AMD_SRIOV_MSG_VBIOS_SIZE_KB
> -#define AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB   4
> -#define AMD_SRIOV_MSG_TMR_OFFSET_KB         2048
> -#define AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB      2
> -#define AMD_SRIOV_RAS_TELEMETRY_SIZE_KB             64
> +#define AMD_SRIOV_MSG_SIZE_KB                           1
> +
>  /*
> - * layout
> + * layout v1
>   * 0           64KB        65KB        66KB           68KB                   132KB
>   * |   VBIOS   |   PF2VF   |   VF2PF   |   Bad Page   | RAS Telemetry Region | ...
>   * |   64KB    |   1KB     |   1KB     |   2KB        | 64KB                 | ...
>   */
>
> -#define AMD_SRIOV_MSG_SIZE_KB                   1
> -#define AMD_SRIOV_MSG_PF2VF_OFFSET_KB           AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB
> -#define AMD_SRIOV_MSG_VF2PF_OFFSET_KB           (AMD_SRIOV_MSG_PF2VF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB)
> -#define AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB        (AMD_SRIOV_MSG_VF2PF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB)
> -#define AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB   (AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB + AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB)
> +/*
> + * layout v2 (offsets are dynamically allocated and the offsets below are examples)
> + * 0           1KB         64KB        65KB        66KB           68KB                   132KB
> + * |  INITD_H  |   VBIOS   |   PF2VF   |   VF2PF   |   Bad Page   | RAS Telemetry Region | ...
> + * |   1KB     |   64KB    |   1KB     |   1KB     |   2KB        | 64KB                 | ...
> + *
> + * Note: PF2VF + VF2PF + Bad Page = DataExchange region (allocated contiguously)
> + */
> +
> +/* v1 layout sizes */
> +#define AMD_SRIOV_MSG_VBIOS_SIZE_KB_V1                 64
> +#define AMD_SRIOV_MSG_PF2VF_SIZE_KB_V1                 1
> +#define AMD_SRIOV_MSG_VF2PF_SIZE_KB_V1                 1
> +#define AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB_V1              2
> +#define AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1         64
> +#define AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB_V1          \
> +       (AMD_SRIOV_MSG_PF2VF_SIZE_KB_V1 + AMD_SRIOV_MSG_VF2PF_SIZE_KB_V1 + \
> +        AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB_V1)
> +
> +/* v1 offsets */
> +#define AMD_SRIOV_MSG_VBIOS_OFFSET_V1                  0
> +#define AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB_V1                AMD_SRIOV_MSG_VBIOS_SIZE_KB_V1
> +#define AMD_SRIOV_MSG_TMR_OFFSET_KB                    2048
> +#define AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1               AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB_V1
> +#define AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1               \
> +       (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 + AMD_SRIOV_MSG_SIZE_KB)
> +#define AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB_V1            \
> +       (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 + AMD_SRIOV_MSG_SIZE_KB)
> +#define AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1       \
> +       (AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB_V1 + AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB_V1)
> +#define AMD_SRIOV_MSG_INIT_DATA_TOT_SIZE_KB_V1         \
> +       (AMD_SRIOV_MSG_VBIOS_SIZE_KB_V1 + AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB_V1 + \
> +        AMD_SRIOV_MSG_RAS_TELEMETRY_SIZE_KB_V1)
>
>  /*
>   * PF2VF history log:
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV
  2025-10-15 21:48 ` [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV Ellen Pan
@ 2025-10-16 12:59   ` Alex Deucher
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2025-10-16 12:59 UTC (permalink / raw)
  To: Ellen Pan
  Cc: amd-gfx, Alexander.Deucher, Christian.Koenig, Lijo.Lazar,
	Jeffrey.Chan

On Wed, Oct 15, 2025 at 5:49 PM Ellen Pan <yunru.pan@amd.com> wrote:
>
> - During guest driver init, as VFs receive the PF msg to
>         init the dynamic critical region (v2), VFs reuse fw_vram_usage_*
>         from ttm to store critical region tables in a 5MB chunk.
>
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  | 29 ++++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 10 +++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      |  9 ++++++
>  3 files changed, 30 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> index c7d32fb216e4..636385c80f64 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
> @@ -181,19 +181,22 @@ int amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev)
>         u8 frev, crev;
>         int usage_bytes = 0;
>
> -       if (amdgpu_atom_parse_data_header(ctx, index, NULL, &frev, &crev, &data_offset)) {
> -               if (frev == 2 && crev == 1) {
> -                       fw_usage_v2_1 =
> -                               (struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset);
> -                       amdgpu_atomfirmware_allocate_fb_v2_1(adev,
> -                                       fw_usage_v2_1,
> -                                       &usage_bytes);
> -               } else if (frev >= 2 && crev >= 2) {
> -                       fw_usage_v2_2 =
> -                               (struct vram_usagebyfirmware_v2_2 *)(ctx->bios + data_offset);
> -                       amdgpu_atomfirmware_allocate_fb_v2_2(adev,
> -                                       fw_usage_v2_2,
> -                                       &usage_bytes);
> +       /* Skip atomfirmware allocation for SRIOV VFs when dynamic crit regn is enabled */
> +       if (!(amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled)) {
> +               if (amdgpu_atom_parse_data_header(ctx, index, NULL, &frev, &crev, &data_offset)) {
> +                       if (frev == 2 && crev == 1) {
> +                               fw_usage_v2_1 =
> +                                       (struct vram_usagebyfirmware_v2_1 *)(ctx->bios + data_offset);
> +                               amdgpu_atomfirmware_allocate_fb_v2_1(adev,
> +                                               fw_usage_v2_1,
> +                                               &usage_bytes);
> +                       } else if (frev >= 2 && crev >= 2) {
> +                               fw_usage_v2_2 =
> +                                       (struct vram_usagebyfirmware_v2_2 *)(ctx->bios + data_offset);
> +                               amdgpu_atomfirmware_allocate_fb_v2_2(adev,
> +                                               fw_usage_v2_2,
> +                                               &usage_bytes);
> +                       }
>                 }
>         }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 7583da3d9ab0..e226c3aff7d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1939,17 +1939,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>                 return r;
>
>         /*
> -        *The reserved vram for driver must be pinned to the specified
> -        *place on the VRAM, so reserve it early.
> +        * The reserved VRAM for the driver must be pinned to a specific
> +        * location in VRAM, so reserve it early.
>          */
>         r = amdgpu_ttm_drv_reserve_vram_init(adev);
>         if (r)
>                 return r;
>
>         /*
> -        * only NAVI10 and onwards ASIC support for IP discovery.
> -        * If IP discovery enabled, a block of memory should be
> -        * reserved for IP discovey.
> +        * only NAVI10 and later ASICs support IP discovery.
> +        * If IP discovery is enabled, a block of memory should be
> +        * reserved for it.
>          */
>         if (adev->discovery.reserve_tmr) {
>                 r = amdgpu_ttm_reserve_tmr(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 805ecc69a8b5..12659990abe0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -999,6 +999,15 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
>                 goto out;
>         }
>
> +       /* reserved memory starts from crit region base offset with the size of 5MB */
> +       adev->mman.fw_vram_usage_start_offset = adev->virt.crit_regn.offset;
> +       adev->mman.fw_vram_usage_size = adev->virt.crit_regn.size_kb << 10;
> +       dev_info(adev->dev,
> +               "critical region v%d requested to reserve memory start at %08x with %d KB.\n",
> +                       init_data_hdr->version,
> +                       adev->mman.fw_vram_usage_start_offset,
> +                       adev->mman.fw_vram_usage_size >> 10);
> +
>         adev->virt.is_dynamic_crit_regn_enabled = true;
>
>  out:
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets
  2025-10-15 21:48 ` [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets Ellen Pan
@ 2025-10-16 13:06   ` Alex Deucher
  2025-10-16 14:06     ` Lazar, Lijo
  0 siblings, 1 reply; 12+ messages in thread
From: Alex Deucher @ 2025-10-16 13:06 UTC (permalink / raw)
  To: Ellen Pan
  Cc: amd-gfx, Alexander.Deucher, Christian.Koenig, Lijo.Lazar,
	Jeffrey.Chan

On Wed, Oct 15, 2025 at 5:56 PM Ellen Pan <yunru.pan@amd.com> wrote:
>
> 1. Added VF logic in amdgpu_virt to init IP discovery using the offsets from dynamic(v2) critical regions;
> 2. Added VF logic in amdgpu_virt to init bios image using the offsets from dynamic(v2) critical regions;
>
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c      | 34 ++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 23 ++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 37 +++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  2 +
>  4 files changed, 83 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
> index 00e96419fcda..070fd61f8463 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
> @@ -96,11 +96,12 @@ void amdgpu_bios_release(struct amdgpu_device *adev)
>   * part of the system bios.  On boot, the system bios puts a
>   * copy of the igp rom at the start of vram if a discrete card is
>   * present.
> - * For SR-IOV, the vbios image is also put in VRAM in the VF.
> + * For SR-IOV, if dynamic critical region is not enabled,
> + * the vbios image is also put at the start of VRAM in the VF.
>   */
>  static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
>  {
> -       uint8_t __iomem *bios;
> +       uint8_t __iomem *bios = NULL;
>         resource_size_t vram_base;
>         resource_size_t size = 256 * 1024; /* ??? */
>
> @@ -114,18 +115,33 @@ static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
>
>         adev->bios = NULL;
>         vram_base = pci_resource_start(adev->pdev, 0);
> -       bios = ioremap_wc(vram_base, size);
> -       if (!bios)
> -               return false;
>
>         adev->bios = kmalloc(size, GFP_KERNEL);
> -       if (!adev->bios) {
> -               iounmap(bios);
> +       if (!adev->bios)
>                 return false;
> +
> +       /* For SRIOV with dynamic critical region is enabled,
> +        * the vbios image is put at a dynamic offset of VRAM in the VF.
> +        * If dynamic critical region is disabled, follow the existing logic as on baremetal.
> +        */
> +       if (amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled) {
> +               if (amdgpu_virt_get_dynamic_data_info(adev,
> +                               AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID, adev->bios, &size)) {
> +                       amdgpu_bios_release(adev);
> +                       return false;
> +               }
> +       } else {
> +               bios = ioremap_wc(vram_base, size);
> +               if (!bios) {
> +                       amdgpu_bios_release(adev);
> +                       return false;
> +               }
> +
> +               memcpy_fromio(adev->bios, bios, size);
> +               iounmap(bios);
>         }
> +
>         adev->bios_size = size;
> -       memcpy_fromio(adev->bios, bios, size);
> -       iounmap(bios);
>
>         if (!check_atom_bios(adev, size)) {
>                 amdgpu_bios_release(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> index 4e75334f3b3a..1809deb86797 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> @@ -303,14 +303,29 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
>          * then it is not required to be reserved.
>          */
>         if (sz_valid) {
> -               uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
> -               amdgpu_device_vram_access(adev, pos, (uint32_t *)binary,
> -                                         adev->discovery.size, false);
> -               adev->discovery.reserve_tmr = true;
> +               if (amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled) {
> +                       /* For SRIOV VFs with dynamic critical region enabled,
> +                        * we will get the IPD binary via below call.
> +                        * If dynamic critical is disabled, fall through to normal seq.
> +                        */
> +                       if (amdgpu_virt_get_dynamic_data_info(adev,
> +                                               AMD_SRIOV_MSG_IPD_TABLE_ID, binary,
> +                                               (uint64_t *)&adev->mman.discovery_tmr_size)) {

I think this is adev->discovery.size now after Lijo's latest changes.
@Lazar, Lijo I think we can remove adev->mman.discovery_tmr_size.
It's no longer used by anything.  With that fixed,
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> +                               ret = -EINVAL;
> +                               goto exit;
> +                       }
> +               } else {
> +                       uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
> +
> +                       amdgpu_device_vram_access(adev, pos, (uint32_t *)binary,
> +                                       adev->discovery.size, false);
> +                       adev->discovery.reserve_tmr = true;
> +               }
>         } else {
>                 ret = amdgpu_discovery_read_binary_from_sysmem(adev, binary);
>         }
>
> +exit:
>         if (ret)
>                 dev_err(adev->dev,
>                         "failed to read discovery info from memory, vram size read: %llx",
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 12659990abe0..15157ed5df29 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -999,6 +999,14 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
>                 goto out;
>         }
>
> +       /* Validation for critical region info */
> +       if (adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb > DISCOVERY_TMR_SIZE) {
> +               dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
> +                               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
>         /* reserved memory starts from crit region base offset with the size of 5MB */
>         adev->mman.fw_vram_usage_start_offset = adev->virt.crit_regn.offset;
>         adev->mman.fw_vram_usage_size = adev->virt.crit_regn.size_kb << 10;
> @@ -1017,6 +1025,35 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
>         return r;
>  }
>
> +int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev,
> +       int data_id, uint8_t *binary, uint64_t *size)
> +{
> +       uint32_t data_offset = 0;
> +       uint32_t data_size = 0;
> +       enum amd_sriov_msg_table_id_enum data_table_id = data_id;
> +
> +       if (data_table_id >= AMD_SRIOV_MSG_MAX_TABLE_ID)
> +               return -EINVAL;
> +
> +       data_offset = adev->virt.crit_regn_tbl[data_table_id].offset;
> +       data_size = adev->virt.crit_regn_tbl[data_table_id].size_kb << 10;
> +
> +       /* Validate on input params */
> +       if (!binary || !size || *size < (uint64_t)data_size)
> +               return -EINVAL;
> +
> +       /* Proceed to copy the dynamic content */
> +       amdgpu_device_vram_access(adev,
> +                       (uint64_t)data_offset, (uint32_t *)binary, data_size, false);
> +       *size = (uint64_t)data_size; // update the size as out param.
> +
> +       dev_dbg(adev->dev,
> +               "Got %s info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n",
> +               amdgpu_virt_dynamic_crit_table_name[data_id], data_offset, data_size);
> +
> +       return 0;
> +}
> +
>  void amdgpu_virt_init(struct amdgpu_device *adev)
>  {
>         bool is_sriov = false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 8d03a8620de9..2a13cc892a13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -442,6 +442,8 @@ void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
>  void amdgpu_virt_init(struct amdgpu_device *adev);
>
>  int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
> +int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev,
> +       int data_id, uint8_t *binary, uint64_t *size);
>
>  bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
>  int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init
  2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
@ 2025-10-16 13:06   ` Lazar, Lijo
  2025-10-16 13:08   ` Alex Deucher
  1 sibling, 0 replies; 12+ messages in thread
From: Lazar, Lijo @ 2025-10-16 13:06 UTC (permalink / raw)
  To: Ellen Pan, amd-gfx; +Cc: Alexander.Deucher, Christian.Koenig, Jeffrey.Chan



On 10/16/2025 3:18 AM, Ellen Pan wrote:
>      1. Introduced amdgpu_virt_init_critical_region during VF init.
>       - VFs use init_data_header_offset and init_data_header_size_kb
>              transmitted via PF2VF mailbox to fetch the offset of
>              critical regions' offsets/sizes in VRAM and save to
>              adev->virt.crit_region_offsets and adev->virt.crit_region_sizes_kb.
> 
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 165 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h    |  11 ++
>   drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  31 ++++
>   4 files changed, 211 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a99185ed0642..3ffb9bb1ec0b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2782,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
>   		r = amdgpu_virt_request_full_gpu(adev, true);
>   		if (r)
>   			return r;
> +
> +		r = amdgpu_virt_init_critical_region(adev);
> +		if (r)
> +			return r;
>   	}
>   
>   	switch (adev->asic_type) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 56573fb27f63..805ecc69a8b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -44,6 +44,18 @@
>   		vf2pf_info->ucode_info[ucode].version = ver; \
>   	} while (0)
>   
> +#define mmRCC_CONFIG_MEMSIZE    0xde3
> +
> +const char *amdgpu_virt_dynamic_crit_table_name[] = {
> +	"IP DISCOVERY",
> +	"VBIOS IMG",
> +	"RAS TELEMETRY",
> +	"DATA EXCHANGE",
> +	"BAD PAGE INFO",
> +	"INIT HEADER",
> +	"LAST",
> +};
> +
>   bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev)
>   {
>   	/* By now all MMIO pages except mailbox are blocked */
> @@ -843,6 +855,159 @@ static void amdgpu_virt_init_ras(struct amdgpu_device *adev)
>   	adev->virt.ras.cper_rptr = 0;
>   }
>   
> +static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end)
> +{
> +	uint32_t sum = 0;
> +
> +	if (buf_start >= buf_end)
> +		return 0;
> +
> +	for (; buf_start < buf_end; buf_start++)
> +		sum += buf_start[0];
> +
> +	return 0xffffffff - sum;
> +}
> +
> +int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
> +{
> +	struct amd_sriov_msg_init_data_header *init_data_hdr = NULL;
> +	uint32_t init_hdr_offset = adev->virt.init_data_header.offset;
> +	uint32_t init_hdr_size = adev->virt.init_data_header.size_kb << 10;
> +	uint64_t vram_size;
> +	int r = 0;
> +	uint8_t checksum = 0;
> +
> +	/* Skip below init if critical region version != v2 */
> +	if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2)
> +		return 0;
> +
> +	if (init_hdr_offset < 0) {
> +		dev_err(adev->dev, "Invalid init header offset\n");
> +		return -EINVAL;
> +	}
> +
> +	vram_size = RREG32(mmRCC_CONFIG_MEMSIZE);
> +	if (!vram_size || vram_size == U32_MAX)
> +		return -EINVAL;
> +	vram_size <<= 20;
> +
> +	if ((init_hdr_offset + init_hdr_size) > vram_size) {
> +		dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n");
> +		return -EINVAL;
> +	}
> +
> +	/* Allocate for init_data_hdr */
> +	init_data_hdr = kzalloc(sizeof(struct amd_sriov_msg_init_data_header), GFP_KERNEL);
> +	if (!init_data_hdr)
> +		return -ENOMEM;
> +
> +	amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr,
> +					sizeof(struct amd_sriov_msg_init_data_header), false);
> +
> +	/* Table validation */
> +	if (strncmp(init_data_hdr->signature,
> +				AMDGPU_SRIOV_CRIT_DATA_SIGNATURE,
> +				AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) {
> +		dev_err(adev->dev, "Invalid init data signature: %.4s\n",
> +			init_data_hdr->signature);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	checksum = amdgpu_virt_crit_region_calc_checksum(
> +			(uint8_t *)&init_data_hdr->initdata_offset,
> +			(uint8_t *)init_data_hdr +
> +			sizeof(struct amd_sriov_msg_init_data_header));
> +	if (checksum != init_data_hdr->checksum) {
> +		dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n",
> +				checksum, init_data_hdr->checksum);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn));
> +	memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl));
> +
> +	adev->virt.crit_regn.offset = init_data_hdr->initdata_offset;
> +	adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb;
> +
> +	/* Validation and initialization for each table entry */
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) {
> +		if (init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) {
> +			dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
> +					init_data_hdr->ip_discovery_size_in_kb);
> +			r = -EINVAL;
> +			goto out;
> +		}
> +
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset =
> +			init_data_hdr->ip_discovery_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb =
> +			init_data_hdr->ip_discovery_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset =
> +			init_data_hdr->vbios_img_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb =
> +			init_data_hdr->vbios_img_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset =
> +			init_data_hdr->ras_tele_info_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb =
> +			init_data_hdr->ras_tele_info_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset =
> +			init_data_hdr->dataexchange_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb =
> +			init_data_hdr->dataexchange_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) {
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset =
> +			init_data_hdr->bad_page_info_offset;
> +		adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb =
> +			init_data_hdr->bad_page_size_in_kb;
> +	} else {
> +		dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +			amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID]);
> +		r = -EINVAL;
> +		goto out;

Could you confirm if this is really an error condition? For ex: I could 
see this scheme followed on SOCs which don't support RAS and thus ras 
telemetry/badpage table sections may not make sense. Same could be 
applicable for others (though most others don't look optional), but you 
may confirm which ones are mandatory vs optional.

Thanks,
Lijo

> +	}
> +
> +	adev->virt.is_dynamic_crit_regn_enabled = true;
> +
> +out:
> +	kfree(init_data_hdr);
> +	init_data_hdr = NULL;
> +
> +	return r;
> +}
> +
>   void amdgpu_virt_init(struct amdgpu_device *adev)
>   {
>   	bool is_sriov = false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 36247a160aa6..8d03a8620de9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -54,6 +54,12 @@
>   
>   #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
>   
> +/* Signature used to validate the SR-IOV dynamic critical region init data header ("INDA") */
> +#define AMDGPU_SRIOV_CRIT_DATA_SIGNATURE "INDA"
> +#define AMDGPU_SRIOV_CRIT_DATA_SIG_LEN   4
> +
> +#define IS_SRIOV_CRIT_REGN_ENTRY_VALID(hdr, id) ((hdr)->valid_tables & (1 << (id)))
> +
>   enum amdgpu_sriov_vf_mode {
>   	SRIOV_VF_MODE_BARE_METAL = 0,
>   	SRIOV_VF_MODE_ONE_VF,
> @@ -296,6 +302,9 @@ struct amdgpu_virt {
>   
>   	/* dynamic(v2) critical regions */
>   	struct amdgpu_virt_region init_data_header;
> +	struct amdgpu_virt_region crit_regn;
> +	struct amdgpu_virt_region crit_regn_tbl[AMD_SRIOV_MSG_MAX_TABLE_ID];
> +	bool is_dynamic_crit_regn_enabled;
>   
>   	/* vf2pf message */
>   	struct delayed_work vf2pf_work;
> @@ -432,6 +441,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
>   void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
>   void amdgpu_virt_init(struct amdgpu_device *adev);
>   
> +int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
> +
>   bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
>   int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
>   void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> index 9228fd2c6dfd..1cee083fb6bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> @@ -71,6 +71,37 @@ enum amd_sriov_crit_region_version {
>   	GPU_CRIT_REGION_V2 = 2,
>   };
>   
> +/* v2 layout offset enum (in order of allocation) */
> +enum amd_sriov_msg_table_id_enum {
> +	AMD_SRIOV_MSG_IPD_TABLE_ID = 0,
> +	AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID,
> +	AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID,
> +	AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID,
> +	AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID,
> +	AMD_SRIOV_MSG_INITD_H_TABLE_ID,
> +	AMD_SRIOV_MSG_MAX_TABLE_ID,
> +};
> +
> +struct amd_sriov_msg_init_data_header {
> +	char     signature[4];  /* "INDA"  */
> +	uint32_t version;
> +	uint32_t checksum;
> +	uint32_t initdata_offset; /* 0 */
> +	uint32_t initdata_size_in_kb; /* 5MB */
> +	uint32_t valid_tables;
> +	uint32_t vbios_img_offset;
> +	uint32_t vbios_img_size_in_kb;
> +	uint32_t dataexchange_offset;
> +	uint32_t dataexchange_size_in_kb;
> +	uint32_t ras_tele_info_offset;
> +	uint32_t ras_tele_info_size_in_kb;
> +	uint32_t ip_discovery_offset;
> +	uint32_t ip_discovery_size_in_kb;
> +	uint32_t bad_page_info_offset;
> +	uint32_t bad_page_size_in_kb;
> +	uint32_t reserved[8];
> +};
> +
>   /*
>    * PF2VF history log:
>    * v1 defined in amdgim


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region to init from dynamic crit_region offsets
  2025-10-15 21:48 ` [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region " Ellen Pan
@ 2025-10-16 13:08   ` Alex Deucher
  0 siblings, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2025-10-16 13:08 UTC (permalink / raw)
  To: Ellen Pan
  Cc: amd-gfx, Alexander.Deucher, Christian.Koenig, Lijo.Lazar,
	Jeffrey.Chan

On Wed, Oct 15, 2025 at 5:49 PM Ellen Pan <yunru.pan@amd.com> wrote:
>
> 1. Added VF logic to init data exchange region using the offsets from dynamic(v2) critical regions;
>
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>

Acked-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 104 ++++++++++++++++++-----
>  1 file changed, 85 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 15157ed5df29..fe9a806dbf34 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -218,12 +218,12 @@ int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev)
>                                     &adev->virt.mm_table.gpu_addr,
>                                     (void *)&adev->virt.mm_table.cpu_addr);
>         if (r) {
> -               DRM_ERROR("failed to alloc mm table and error = %d.\n", r);
> +               dev_err(adev->dev, "failed to alloc mm table and error = %d.\n", r);
>                 return r;
>         }
>
>         memset((void *)adev->virt.mm_table.cpu_addr, 0, PAGE_SIZE);
> -       DRM_INFO("MM table gpu addr = 0x%llx, cpu addr = %p.\n",
> +       dev_info(adev->dev, "MM table gpu addr = 0x%llx, cpu addr = %p.\n",
>                  adev->virt.mm_table.gpu_addr,
>                  adev->virt.mm_table.cpu_addr);
>         return 0;
> @@ -403,7 +403,9 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
>                         if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
>                                                         AMDGPU_GPU_PAGE_SIZE,
>                                                         &bo, NULL))
> -                               DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp);
> +                               dev_dbg(adev->dev,
> +                                               "RAS WARN: reserve vram for retired page %llx fail\n",
> +                                               bp);
>                         data->bps_bo[i] = bo;
>                 }
>                 data->last_reserved = i + 1;
> @@ -671,10 +673,34 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
>         schedule_delayed_work(&(adev->virt.vf2pf_work), adev->virt.vf2pf_update_interval_ms);
>  }
>
> +static int amdgpu_virt_read_exchange_data_from_mem(struct amdgpu_device *adev, uint32_t *pfvf_data)
> +{
> +       uint32_t dataexchange_offset =
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset;
> +       uint32_t dataexchange_size =
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10;
> +       uint64_t pos = 0;
> +
> +       dev_info(adev->dev,
> +                       "Got data exchange info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n",
> +                       dataexchange_offset, dataexchange_size);
> +
> +       if (!IS_ALIGNED(dataexchange_offset, 4) || !IS_ALIGNED(dataexchange_size, 4)) {
> +               dev_err(adev->dev, "Data exchange data not aligned to 4 bytes\n");
> +               return -EINVAL;
> +       }
> +
> +       pos = (uint64_t)dataexchange_offset;
> +       amdgpu_device_vram_access(adev, pos, pfvf_data,
> +                       dataexchange_size, false);
> +
> +       return 0;
> +}
> +
>  void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev)
>  {
>         if (adev->virt.vf2pf_update_interval_ms != 0) {
> -               DRM_INFO("clean up the vf2pf work item\n");
> +               dev_info(adev->dev, "clean up the vf2pf work item\n");
>                 cancel_delayed_work_sync(&adev->virt.vf2pf_work);
>                 adev->virt.vf2pf_update_interval_ms = 0;
>         }
> @@ -682,13 +708,15 @@ void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev)
>
>  void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
>  {
> +       uint32_t *pfvf_data = NULL;
> +
>         adev->virt.fw_reserve.p_pf2vf = NULL;
>         adev->virt.fw_reserve.p_vf2pf = NULL;
>         adev->virt.vf2pf_update_interval_ms = 0;
>         adev->virt.vf2pf_update_retry_cnt = 0;
>
>         if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
> -               DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
> +               dev_warn(adev->dev, "Currently fw_vram and drv_vram should not have values at the same time!");
>         } else if (adev->mman.fw_vram_usage_va || adev->mman.drv_vram_usage_va) {
>                 /* go through this logic in ip_init and reset to init workqueue*/
>                 amdgpu_virt_exchange_data(adev);
> @@ -697,11 +725,34 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
>                 schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms));
>         } else if (adev->bios != NULL) {
>                 /* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/
> -               adev->virt.fw_reserve.p_pf2vf =
> -                       (struct amd_sriov_msg_pf2vf_info_header *)
> -                       (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
> +               if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) {
> +                       pfvf_data =
> +                               kzalloc(adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb << 10,
> +                                       GFP_KERNEL);
> +                       if (!pfvf_data) {
> +                               dev_err(adev->dev, "Failed to allocate memory for pfvf_data\n");
> +                               return;
> +                       }
>
> -               amdgpu_virt_read_pf2vf_data(adev);
> +                       if (amdgpu_virt_read_exchange_data_from_mem(adev, pfvf_data))
> +                               goto free_pfvf_data;
> +
> +                       adev->virt.fw_reserve.p_pf2vf =
> +                               (struct amd_sriov_msg_pf2vf_info_header *)pfvf_data;
> +
> +                       amdgpu_virt_read_pf2vf_data(adev);
> +
> +free_pfvf_data:
> +                       kfree(pfvf_data);
> +                       pfvf_data = NULL;
> +                       adev->virt.fw_reserve.p_pf2vf = NULL;
> +               } else {
> +                       adev->virt.fw_reserve.p_pf2vf =
> +                               (struct amd_sriov_msg_pf2vf_info_header *)
> +                               (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
> +
> +                       amdgpu_virt_read_pf2vf_data(adev);
> +               }
>         }
>  }
>
> @@ -714,14 +765,29 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev)
>
>         if (adev->mman.fw_vram_usage_va || adev->mman.drv_vram_usage_va) {
>                 if (adev->mman.fw_vram_usage_va) {
> -                       adev->virt.fw_reserve.p_pf2vf =
> -                               (struct amd_sriov_msg_pf2vf_info_header *)
> -                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
> -                       adev->virt.fw_reserve.p_vf2pf =
> -                               (struct amd_sriov_msg_vf2pf_info_header *)
> -                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
> -                       adev->virt.fw_reserve.ras_telemetry =
> -                               (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
> +                       if (adev->virt.req_init_data_ver == GPU_CRIT_REGION_V2) {
> +                               adev->virt.fw_reserve.p_pf2vf =
> +                                       (struct amd_sriov_msg_pf2vf_info_header *)
> +                                       (adev->mman.fw_vram_usage_va +
> +                                       adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset);
> +                               adev->virt.fw_reserve.p_vf2pf =
> +                                       (struct amd_sriov_msg_vf2pf_info_header *)
> +                                       (adev->mman.fw_vram_usage_va +
> +                                       adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset +
> +                                       (AMD_SRIOV_MSG_SIZE_KB << 10));
> +                               adev->virt.fw_reserve.ras_telemetry =
> +                                       (adev->mman.fw_vram_usage_va +
> +                                       adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset);
> +                       } else {
> +                               adev->virt.fw_reserve.p_pf2vf =
> +                                       (struct amd_sriov_msg_pf2vf_info_header *)
> +                                       (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB_V1 << 10));
> +                               adev->virt.fw_reserve.p_vf2pf =
> +                                       (struct amd_sriov_msg_vf2pf_info_header *)
> +                                       (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB_V1 << 10));
> +                               adev->virt.fw_reserve.ras_telemetry =
> +                                       (adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB_V1 << 10));
> +                       }
>                 } else if (adev->mman.drv_vram_usage_va) {
>                         adev->virt.fw_reserve.p_pf2vf =
>                                 (struct amd_sriov_msg_pf2vf_info_header *)
> @@ -829,7 +895,7 @@ static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg)
>                         break;
>                 default: /* other chip doesn't support SRIOV */
>                         is_sriov = false;
> -                       DRM_ERROR("Unknown asic type: %d!\n", adev->asic_type);
> +                       dev_err(adev->dev, "Unknown asic type: %d!\n", adev->asic_type);
>                         break;
>                 }
>         }
> @@ -1501,7 +1567,7 @@ amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block bloc
>         case AMDGPU_RAS_BLOCK__MPIO:
>                 return RAS_TELEMETRY_GPU_BLOCK_MPIO;
>         default:
> -               DRM_WARN_ONCE("Unsupported SRIOV RAS telemetry block 0x%x\n",
> +               dev_warn(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n",
>                               block);
>                 return RAS_TELEMETRY_GPU_BLOCK_COUNT;
>         }
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init
  2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
  2025-10-16 13:06   ` Lazar, Lijo
@ 2025-10-16 13:08   ` Alex Deucher
  1 sibling, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2025-10-16 13:08 UTC (permalink / raw)
  To: Ellen Pan
  Cc: amd-gfx, Alexander.Deucher, Christian.Koenig, Lijo.Lazar,
	Jeffrey.Chan

On Wed, Oct 15, 2025 at 5:49 PM Ellen Pan <yunru.pan@amd.com> wrote:
>
>     1. Introduced amdgpu_virt_init_critical_region during VF init.
>      - VFs use init_data_header_offset and init_data_header_size_kb
>             transmitted via PF2VF mailbox to fetch the offset of
>             critical regions' offsets/sizes in VRAM and save to
>             adev->virt.crit_region_offsets and adev->virt.crit_region_sizes_kb.
>
> Signed-off-by: Ellen Pan <yunru.pan@amd.com>

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |   4 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c    | 165 ++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h    |  11 ++
>  drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h |  31 ++++
>  4 files changed, 211 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a99185ed0642..3ffb9bb1ec0b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2782,6 +2782,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
>                 r = amdgpu_virt_request_full_gpu(adev, true);
>                 if (r)
>                         return r;
> +
> +               r = amdgpu_virt_init_critical_region(adev);
> +               if (r)
> +                       return r;
>         }
>
>         switch (adev->asic_type) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 56573fb27f63..805ecc69a8b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -44,6 +44,18 @@
>                 vf2pf_info->ucode_info[ucode].version = ver; \
>         } while (0)
>
> +#define mmRCC_CONFIG_MEMSIZE    0xde3
> +
> +const char *amdgpu_virt_dynamic_crit_table_name[] = {
> +       "IP DISCOVERY",
> +       "VBIOS IMG",
> +       "RAS TELEMETRY",
> +       "DATA EXCHANGE",
> +       "BAD PAGE INFO",
> +       "INIT HEADER",
> +       "LAST",
> +};
> +
>  bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev)
>  {
>         /* By now all MMIO pages except mailbox are blocked */
> @@ -843,6 +855,159 @@ static void amdgpu_virt_init_ras(struct amdgpu_device *adev)
>         adev->virt.ras.cper_rptr = 0;
>  }
>
> +static uint8_t amdgpu_virt_crit_region_calc_checksum(uint8_t *buf_start, uint8_t *buf_end)
> +{
> +       uint32_t sum = 0;
> +
> +       if (buf_start >= buf_end)
> +               return 0;
> +
> +       for (; buf_start < buf_end; buf_start++)
> +               sum += buf_start[0];
> +
> +       return 0xffffffff - sum;
> +}
> +
> +int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
> +{
> +       struct amd_sriov_msg_init_data_header *init_data_hdr = NULL;
> +       uint32_t init_hdr_offset = adev->virt.init_data_header.offset;
> +       uint32_t init_hdr_size = adev->virt.init_data_header.size_kb << 10;
> +       uint64_t vram_size;
> +       int r = 0;
> +       uint8_t checksum = 0;
> +
> +       /* Skip below init if critical region version != v2 */
> +       if (adev->virt.req_init_data_ver != GPU_CRIT_REGION_V2)
> +               return 0;
> +
> +       if (init_hdr_offset < 0) {
> +               dev_err(adev->dev, "Invalid init header offset\n");
> +               return -EINVAL;
> +       }
> +
> +       vram_size = RREG32(mmRCC_CONFIG_MEMSIZE);
> +       if (!vram_size || vram_size == U32_MAX)
> +               return -EINVAL;
> +       vram_size <<= 20;
> +
> +       if ((init_hdr_offset + init_hdr_size) > vram_size) {
> +               dev_err(adev->dev, "init_data_header exceeds VRAM size, exiting\n");
> +               return -EINVAL;
> +       }
> +
> +       /* Allocate for init_data_hdr */
> +       init_data_hdr = kzalloc(sizeof(struct amd_sriov_msg_init_data_header), GFP_KERNEL);
> +       if (!init_data_hdr)
> +               return -ENOMEM;
> +
> +       amdgpu_device_vram_access(adev, (uint64_t)init_hdr_offset, (uint32_t *)init_data_hdr,
> +                                       sizeof(struct amd_sriov_msg_init_data_header), false);
> +
> +       /* Table validation */
> +       if (strncmp(init_data_hdr->signature,
> +                               AMDGPU_SRIOV_CRIT_DATA_SIGNATURE,
> +                               AMDGPU_SRIOV_CRIT_DATA_SIG_LEN) != 0) {
> +               dev_err(adev->dev, "Invalid init data signature: %.4s\n",
> +                       init_data_hdr->signature);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       checksum = amdgpu_virt_crit_region_calc_checksum(
> +                       (uint8_t *)&init_data_hdr->initdata_offset,
> +                       (uint8_t *)init_data_hdr +
> +                       sizeof(struct amd_sriov_msg_init_data_header));
> +       if (checksum != init_data_hdr->checksum) {
> +               dev_err(adev->dev, "Found unmatching checksum from calculation 0x%x and init_data 0x%x\n",
> +                               checksum, init_data_hdr->checksum);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       memset(&adev->virt.crit_regn, 0, sizeof(adev->virt.crit_regn));
> +       memset(adev->virt.crit_regn_tbl, 0, sizeof(adev->virt.crit_regn_tbl));
> +
> +       adev->virt.crit_regn.offset = init_data_hdr->initdata_offset;
> +       adev->virt.crit_regn.size_kb = init_data_hdr->initdata_size_in_kb;
> +
> +       /* Validation and initialization for each table entry */
> +       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_IPD_TABLE_ID)) {
> +               if (init_data_hdr->ip_discovery_size_in_kb > DISCOVERY_TMR_SIZE) {
> +                       dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
> +                                       init_data_hdr->ip_discovery_size_in_kb);
> +                       r = -EINVAL;
> +                       goto out;
> +               }
> +
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].offset =
> +                       init_data_hdr->ip_discovery_offset;
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb =
> +                       init_data_hdr->ip_discovery_size_in_kb;
> +       } else {
> +               dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +                       amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_IPD_TABLE_ID]);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID)) {
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].offset =
> +                       init_data_hdr->vbios_img_offset;
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID].size_kb =
> +                       init_data_hdr->vbios_img_size_in_kb;
> +       } else {
> +               dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +                       amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID]);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID)) {
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].offset =
> +                       init_data_hdr->ras_tele_info_offset;
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID].size_kb =
> +                       init_data_hdr->ras_tele_info_size_in_kb;
> +       } else {
> +               dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +                       amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID]);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID)) {
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].offset =
> +                       init_data_hdr->dataexchange_offset;
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID].size_kb =
> +                       init_data_hdr->dataexchange_size_in_kb;
> +       } else {
> +               dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +                       amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID]);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (IS_SRIOV_CRIT_REGN_ENTRY_VALID(init_data_hdr, AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID)) {
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].offset =
> +                       init_data_hdr->bad_page_info_offset;
> +               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID].size_kb =
> +                       init_data_hdr->bad_page_size_in_kb;
> +       } else {
> +               dev_err(adev->dev, "Missing dynamic %s info from critical region v2.\n",
> +                       amdgpu_virt_dynamic_crit_table_name[AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID]);
> +               r = -EINVAL;
> +               goto out;
> +       }
> +
> +       adev->virt.is_dynamic_crit_regn_enabled = true;
> +
> +out:
> +       kfree(init_data_hdr);
> +       init_data_hdr = NULL;
> +
> +       return r;
> +}
> +
>  void amdgpu_virt_init(struct amdgpu_device *adev)
>  {
>         bool is_sriov = false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 36247a160aa6..8d03a8620de9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -54,6 +54,12 @@
>
>  #define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2
>
> +/* Signature used to validate the SR-IOV dynamic critical region init data header ("INDA") */
> +#define AMDGPU_SRIOV_CRIT_DATA_SIGNATURE "INDA"
> +#define AMDGPU_SRIOV_CRIT_DATA_SIG_LEN   4
> +
> +#define IS_SRIOV_CRIT_REGN_ENTRY_VALID(hdr, id) ((hdr)->valid_tables & (1 << (id)))
> +
>  enum amdgpu_sriov_vf_mode {
>         SRIOV_VF_MODE_BARE_METAL = 0,
>         SRIOV_VF_MODE_ONE_VF,
> @@ -296,6 +302,9 @@ struct amdgpu_virt {
>
>         /* dynamic(v2) critical regions */
>         struct amdgpu_virt_region init_data_header;
> +       struct amdgpu_virt_region crit_regn;
> +       struct amdgpu_virt_region crit_regn_tbl[AMD_SRIOV_MSG_MAX_TABLE_ID];
> +       bool is_dynamic_crit_regn_enabled;
>
>         /* vf2pf message */
>         struct delayed_work vf2pf_work;
> @@ -432,6 +441,8 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev);
>  void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
>  void amdgpu_virt_init(struct amdgpu_device *adev);
>
> +int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
> +
>  bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
>  int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
>  void amdgpu_virt_disable_access_debugfs(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> index 9228fd2c6dfd..1cee083fb6bd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
> @@ -71,6 +71,37 @@ enum amd_sriov_crit_region_version {
>         GPU_CRIT_REGION_V2 = 2,
>  };
>
> +/* v2 layout offset enum (in order of allocation) */
> +enum amd_sriov_msg_table_id_enum {
> +       AMD_SRIOV_MSG_IPD_TABLE_ID = 0,
> +       AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID,
> +       AMD_SRIOV_MSG_RAS_TELEMETRY_TABLE_ID,
> +       AMD_SRIOV_MSG_DATAEXCHANGE_TABLE_ID,
> +       AMD_SRIOV_MSG_BAD_PAGE_INFO_TABLE_ID,
> +       AMD_SRIOV_MSG_INITD_H_TABLE_ID,
> +       AMD_SRIOV_MSG_MAX_TABLE_ID,
> +};
> +
> +struct amd_sriov_msg_init_data_header {
> +       char     signature[4];  /* "INDA"  */
> +       uint32_t version;
> +       uint32_t checksum;
> +       uint32_t initdata_offset; /* 0 */
> +       uint32_t initdata_size_in_kb; /* 5MB */
> +       uint32_t valid_tables;
> +       uint32_t vbios_img_offset;
> +       uint32_t vbios_img_size_in_kb;
> +       uint32_t dataexchange_offset;
> +       uint32_t dataexchange_size_in_kb;
> +       uint32_t ras_tele_info_offset;
> +       uint32_t ras_tele_info_size_in_kb;
> +       uint32_t ip_discovery_offset;
> +       uint32_t ip_discovery_size_in_kb;
> +       uint32_t bad_page_info_offset;
> +       uint32_t bad_page_size_in_kb;
> +       uint32_t reserved[8];
> +};
> +
>  /*
>   * PF2VF history log:
>   * v1 defined in amdgim
> --
> 2.34.1
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets
  2025-10-16 13:06   ` Alex Deucher
@ 2025-10-16 14:06     ` Lazar, Lijo
  0 siblings, 0 replies; 12+ messages in thread
From: Lazar, Lijo @ 2025-10-16 14:06 UTC (permalink / raw)
  To: Alex Deucher, Ellen Pan
  Cc: amd-gfx, Alexander.Deucher, Christian.Koenig, Jeffrey.Chan



On 10/16/2025 6:36 PM, Alex Deucher wrote:
> On Wed, Oct 15, 2025 at 5:56 PM Ellen Pan <yunru.pan@amd.com> wrote:
>>
>> 1. Added VF logic in amdgpu_virt to init IP discovery using the offsets from dynamic(v2) critical regions;
>> 2. Added VF logic in amdgpu_virt to init bios image using the offsets from dynamic(v2) critical regions;
>>
>> Signed-off-by: Ellen Pan <yunru.pan@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c      | 34 ++++++++++++-----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 23 ++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c      | 37 +++++++++++++++++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h      |  2 +
>>   4 files changed, 83 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
>> index 00e96419fcda..070fd61f8463 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
>> @@ -96,11 +96,12 @@ void amdgpu_bios_release(struct amdgpu_device *adev)
>>    * part of the system bios.  On boot, the system bios puts a
>>    * copy of the igp rom at the start of vram if a discrete card is
>>    * present.
>> - * For SR-IOV, the vbios image is also put in VRAM in the VF.
>> + * For SR-IOV, if dynamic critical region is not enabled,
>> + * the vbios image is also put at the start of VRAM in the VF.
>>    */
>>   static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
>>   {
>> -       uint8_t __iomem *bios;
>> +       uint8_t __iomem *bios = NULL;
>>          resource_size_t vram_base;
>>          resource_size_t size = 256 * 1024; /* ??? */
>>
>> @@ -114,18 +115,33 @@ static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
>>
>>          adev->bios = NULL;
>>          vram_base = pci_resource_start(adev->pdev, 0);
>> -       bios = ioremap_wc(vram_base, size);
>> -       if (!bios)
>> -               return false;
>>
>>          adev->bios = kmalloc(size, GFP_KERNEL);
>> -       if (!adev->bios) {
>> -               iounmap(bios);
>> +       if (!adev->bios)
>>                  return false;
>> +
>> +       /* For SRIOV with dynamic critical region is enabled,
>> +        * the vbios image is put at a dynamic offset of VRAM in the VF.
>> +        * If dynamic critical region is disabled, follow the existing logic as on baremetal.
>> +        */
>> +       if (amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled) {
>> +               if (amdgpu_virt_get_dynamic_data_info(adev,
>> +                               AMD_SRIOV_MSG_VBIOS_IMG_TABLE_ID, adev->bios, &size)) {
>> +                       amdgpu_bios_release(adev);
>> +                       return false;
>> +               }
>> +       } else {
>> +               bios = ioremap_wc(vram_base, size);
>> +               if (!bios) {
>> +                       amdgpu_bios_release(adev);
>> +                       return false;
>> +               }
>> +
>> +               memcpy_fromio(adev->bios, bios, size);
>> +               iounmap(bios);
>>          }
>> +
>>          adev->bios_size = size;
>> -       memcpy_fromio(adev->bios, bios, size);
>> -       iounmap(bios);
>>
>>          if (!check_atom_bios(adev, size)) {
>>                  amdgpu_bios_release(adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> index 4e75334f3b3a..1809deb86797 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>> @@ -303,14 +303,29 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,
>>           * then it is not required to be reserved.
>>           */
>>          if (sz_valid) {
>> -               uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
>> -               amdgpu_device_vram_access(adev, pos, (uint32_t *)binary,
>> -                                         adev->discovery.size, false);
>> -               adev->discovery.reserve_tmr = true;
>> +               if (amdgpu_sriov_vf(adev) && adev->virt.is_dynamic_crit_regn_enabled) {
>> +                       /* For SRIOV VFs with dynamic critical region enabled,
>> +                        * we will get the IPD binary via below call.
>> +                        * If dynamic critical is disabled, fall through to normal seq.
>> +                        */
>> +                       if (amdgpu_virt_get_dynamic_data_info(adev,
>> +                                               AMD_SRIOV_MSG_IPD_TABLE_ID, binary,
>> +                                               (uint64_t *)&adev->mman.discovery_tmr_size)) {
> 
> I think this is adev->discovery.size now after Lijo's latest changes.
> @Lazar, Lijo I think we can remove adev->mman.discovery_tmr_size.
> It's no longer used by anything.  With that fixed,
> Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

Will do. Other than that, and one comment below, this patch looks good 
to me as well.
	>
>> +                               ret = -EINVAL;
>> +                               goto exit;

The message printed at the exit: label is not valid for this case. It would be
better to print a separate, case-specific error message here.

Thanks,
Lijo
>> +                       }
>> +               } else {
>> +                       uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET;
>> +
>> +                       amdgpu_device_vram_access(adev, pos, (uint32_t *)binary,
>> +                                       adev->discovery.size, false);
>> +                       adev->discovery.reserve_tmr = true;
>> +               }
>>          } else {
>>                  ret = amdgpu_discovery_read_binary_from_sysmem(adev, binary);
>>          }
>>
>> +exit:
>>          if (ret)
>>                  dev_err(adev->dev,
>>                          "failed to read discovery info from memory, vram size read: %llx",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> index 12659990abe0..15157ed5df29 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>> @@ -999,6 +999,14 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
>>                  goto out;
>>          }
>>
>> +       /* Validation for critical region info */
>> +       if (adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb > DISCOVERY_TMR_SIZE) {
>> +               dev_err(adev->dev, "Invalid IP discovery size: 0x%x\n",
>> +                               adev->virt.crit_regn_tbl[AMD_SRIOV_MSG_IPD_TABLE_ID].size_kb);
>> +               r = -EINVAL;
>> +               goto out;
>> +       }
>> +
>>          /* reserved memory starts from crit region base offset with the size of 5MB */
>>          adev->mman.fw_vram_usage_start_offset = adev->virt.crit_regn.offset;
>>          adev->mman.fw_vram_usage_size = adev->virt.crit_regn.size_kb << 10;
>> @@ -1017,6 +1025,35 @@ int amdgpu_virt_init_critical_region(struct amdgpu_device *adev)
>>          return r;
>>   }
>>
>> +int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev,
>> +       int data_id, uint8_t *binary, uint64_t *size)
>> +{
>> +       uint32_t data_offset = 0;
>> +       uint32_t data_size = 0;
>> +       enum amd_sriov_msg_table_id_enum data_table_id = data_id;
>> +
>> +       if (data_table_id >= AMD_SRIOV_MSG_MAX_TABLE_ID)
>> +               return -EINVAL;
>> +
>> +       data_offset = adev->virt.crit_regn_tbl[data_table_id].offset;
>> +       data_size = adev->virt.crit_regn_tbl[data_table_id].size_kb << 10;
>> +
>> +       /* Validate on input params */
>> +       if (!binary || !size || *size < (uint64_t)data_size)
>> +               return -EINVAL;
>> +
>> +       /* Proceed to copy the dynamic content */
>> +       amdgpu_device_vram_access(adev,
>> +                       (uint64_t)data_offset, (uint32_t *)binary, data_size, false);
>> +       *size = (uint64_t)data_size; // update the size as out param.
>> +
>> +       dev_dbg(adev->dev,
>> +               "Got %s info from dynamic crit_region_table at offset 0x%x with size of 0x%x bytes.\n",
>> +               amdgpu_virt_dynamic_crit_table_name[data_id], data_offset, data_size);
>> +
>> +       return 0;
>> +}
>> +
>>   void amdgpu_virt_init(struct amdgpu_device *adev)
>>   {
>>          bool is_sriov = false;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> index 8d03a8620de9..2a13cc892a13 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>> @@ -442,6 +442,8 @@ void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev);
>>   void amdgpu_virt_init(struct amdgpu_device *adev);
>>
>>   int amdgpu_virt_init_critical_region(struct amdgpu_device *adev);
>> +int amdgpu_virt_get_dynamic_data_info(struct amdgpu_device *adev,
>> +       int data_id, uint8_t *binary, uint64_t *size);
>>
>>   bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev);
>>   int amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev);
>> --
>> 2.34.1
>>


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2025-10-16 14:06 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-10-15 21:48 [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Ellen Pan
2025-10-15 21:48 ` [PATCH v5 3/6] drm/amdgpu: Introduce SRIOV critical regions v2 during VF init Ellen Pan
2025-10-16 13:06   ` Lazar, Lijo
2025-10-16 13:08   ` Alex Deucher
2025-10-15 21:48 ` [PATCH v5 4/6] drm/amdgpu: Reuse fw_vram_usage_* for dynamic critical region in SRIOV Ellen Pan
2025-10-16 12:59   ` Alex Deucher
2025-10-15 21:48 ` [PATCH v5 5/6] drm/amdgpu: Add logic for VF ipd and VF bios to init from dynamic crit_region offsets Ellen Pan
2025-10-16 13:06   ` Alex Deucher
2025-10-16 14:06     ` Lazar, Lijo
2025-10-15 21:48 ` [PATCH v5 6/6] drm/amdgpu: Add logic for VF data exchange region " Ellen Pan
2025-10-16 13:08   ` Alex Deucher
2025-10-16 12:56 ` [PATCH v5 1/6] drm/amdgpu: Updated naming of SRIOV critical region offsets/sizes with _V1 suffix Alex Deucher

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox