[RFC 11/14] vfio/nvgrace-egm: Fetch EGM region retired pages list

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

From: <ankita@nvidia.com>
To: <ankita@nvidia.com>, <jgg@nvidia.com>,
	<alex.williamson@redhat.com>, <yishaih@nvidia.com>,
	<skolothumtho@nvidia.com>, <kevin.tian@intel.com>,
	<yi.l.liu@intel.com>, <zhiw@nvidia.com>
Cc: <aniketa@nvidia.com>, <cjia@nvidia.com>, <kwankhede@nvidia.com>,
	<targupta@nvidia.com>, <vsethi@nvidia.com>, <acurrid@nvidia.com>,
	<apopple@nvidia.com>, <jhubbard@nvidia.com>, <danw@nvidia.com>,
	<anuaggarwal@nvidia.com>, <mochs@nvidia.com>, <kjaju@nvidia.com>,
	<dnigam@nvidia.com>, <kvm@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>
Subject: [RFC 11/14] vfio/nvgrace-egm: Fetch EGM region retired pages list
Date: Thu, 4 Sep 2025 04:08:25 +0000	[thread overview]
Message-ID: <20250904040828.319452-12-ankita@nvidia.com> (raw)
In-Reply-To: <20250904040828.319452-1-ankita@nvidia.com>

From: Ankit Agrawal <ankita@nvidia.com>

It is possible for some system memory pages in the EGM region to
have uncorrectable ECC errors. A list of pages known to have such
errors (referred to as retired pages) is maintained by the Host
UEFI. The Host UEFI populates this list in a reserved region and
communicates the SPA of that region through an ACPI DSDT property.

The nvgrace-egm module is responsible for storing the list of retired
page offsets so that it can be made available to usermode processes.
The module:
1. Gets the reserved memory region SPA and maps it to fetch
the list of bad pages.
2. Calculates the retired page offsets in the EGM and stores them.

Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
---
 drivers/vfio/pci/nvgrace-gpu/egm.c     | 81 ++++++++++++++++++++++++++
 drivers/vfio/pci/nvgrace-gpu/egm_dev.c | 32 ++++++++--
 drivers/vfio/pci/nvgrace-gpu/egm_dev.h |  5 +-
 drivers/vfio/pci/nvgrace-gpu/main.c    |  8 ++-
 include/linux/nvgrace-egm.h            |  2 +
 5 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index bf1241ed1d60..7a026b4d98f7 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -8,6 +8,11 @@
 
 #define MAX_EGM_NODES 4
 
+struct h_node {
+	unsigned long mem_offset;	/* retired page offset from EGM base */
+	struct hlist_node node;		/* link in the chardev's htbl */
+};
+
 static dev_t dev;
 static struct class *class;
 static DEFINE_XARRAY(egm_chardevs);
@@ -16,6 +21,7 @@ struct chardev {
 	struct device device;
 	struct cdev cdev;
 	atomic_t open_count;
+	DECLARE_HASHTABLE(htbl, 0x10);
 };
 
 static struct nvgrace_egm_dev *
@@ -145,20 +151,86 @@ static void del_egm_chardev(struct chardev *egm_chardev)
 	put_device(&egm_chardev->device);
 }
 
+static void cleanup_retired_pages(struct chardev *egm_chardev)
+{
+	struct hlist_node *next;	/* lets the walk survive deletion */
+	struct h_node *entry;
+	unsigned long bucket;
+
+	hash_for_each_safe(egm_chardev->htbl, bucket, next, entry, node) {
+		hash_del(&entry->node);	/* unlink first, then free */
+		kvfree(entry);
+	}
+}
+
+/* Hash each listed retired page's EGM offset; returns 0 or a -errno. */
+static int nvgrace_egm_fetch_retired_pages(struct nvgrace_egm_dev *egm_dev,
+					   struct chardev *egm_chardev)
+{
+	u64 count, index;
+	u64 *memaddr;
+	int ret = 0;
+
+	memaddr = memremap(egm_dev->retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
+	if (!memaddr)
+		return -ENOMEM;
+
+	/* Word 0 is the entry count; it must fit in the PAGE_SIZE mapping. */
+	count = memaddr[0];
+	if (count > PAGE_SIZE / sizeof(u64) - 1) {
+		ret = -EINVAL;
+		goto unmap;
+	}
+
+	for (index = 0; index < count; index++) {
+		struct h_node *retired_page;
+
+		retired_page = kvzalloc(sizeof(*retired_page), GFP_KERNEL);
+		if (!retired_page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/* EGM is linearly mapped: carveout offset == VM memory offset. */
+		retired_page->mem_offset = memaddr[index + 1] -
+					   egm_dev->egmphys;
+		hash_add(egm_chardev->htbl, &retired_page->node,
+			 retired_page->mem_offset);
+	}
+
+unmap:
+	memunmap(memaddr);
+	if (ret)
+		cleanup_retired_pages(egm_chardev);
+
+	return ret;
+}
+
 static int egm_driver_probe(struct auxiliary_device *aux_dev,
 			    const struct auxiliary_device_id *id)
 {
 	struct nvgrace_egm_dev *egm_dev =
 		container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
 	struct chardev *egm_chardev;
+	int ret;
 
 	egm_chardev = setup_egm_chardev(egm_dev);
 	if (!egm_chardev)
 		return -EINVAL;
 
+	hash_init(egm_chardev->htbl);	/* start with empty buckets */
+
+	ret = nvgrace_egm_fetch_retired_pages(egm_dev, egm_chardev);
+	if (ret)
+		goto error_exit;	/* fetch already freed its entries */
+
 	xa_store(&egm_chardevs, egm_dev->egmpxm, egm_chardev, GFP_KERNEL);
 
 	return 0;
+
+error_exit:
+	del_egm_chardev(egm_chardev);	/* release the chardev set up above */
+	return ret;
 }
 
 static void egm_driver_remove(struct auxiliary_device *aux_dev)
@@ -166,10 +238,12 @@ static void egm_driver_remove(struct auxiliary_device *aux_dev)
 	struct nvgrace_egm_dev *egm_dev =
 		container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
 	struct chardev *egm_chardev = xa_erase(&egm_chardevs, egm_dev->egmpxm);
 
 	if (!egm_chardev)
 		return;
 
+	/* Free every retired page entry before the chardev itself goes away. */
+	cleanup_retired_pages(egm_chardev);
 	del_egm_chardev(egm_chardev);
 }
 
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
index ca50bc1f67a0..b8e143542bce 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
@@ -18,22 +18,41 @@ int nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm)
 }
 
 int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
-				   u64 *pegmlength)
+				   u64 *pegmlength, u64 *pretiredpagesphys)
 {
 	int ret;
 
 	/*
-	 * The memory information is present in the system ACPI tables as DSD
-	 * properties nvidia,egm-base-pa and nvidia,egm-size.
+	 * The EGM memory information is present in the system ACPI tables
+	 * as DSD properties nvidia,egm-base-pa and nvidia,egm-size.
 	 */
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size",
 				       pegmlength);
 	if (ret)
-		return ret;
+		goto error_exit;
 
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
 				       pegmphys);
+	if (ret)
+		goto error_exit;
+
+	/*
+	 * SBIOS places the list of retired pages in a region whose SPA
+	 * is exposed as "nvidia,egm-retired-pages-data-base".
+	 */
+	ret = device_property_read_u64(&pdev->dev,
+				       "nvidia,egm-retired-pages-data-base",
+				       pretiredpagesphys);
+	if (ret)
+		goto error_exit;
+
+	/* A zero SPA is a firmware bug; fail here rather than crash later */
+	if (*pretiredpagesphys == 0) {
+		dev_err(&pdev->dev, "Retired pages region is not setup\n");
+		ret = -EINVAL;
+	}
 
+error_exit:
 	return ret;
 }
 
@@ -74,7 +93,8 @@ static void nvgrace_gpu_release_aux_device(struct device *device)
 
 struct nvgrace_egm_dev *
 nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
-			      u64 egmphys, u64 egmlength, u64 egmpxm)
+			      u64 egmphys, u64 egmlength, u64 egmpxm,
+			      u64 retiredpagesphys)
 {
 	struct nvgrace_egm_dev *egm_dev;
 	int ret;
@@ -86,6 +106,8 @@ nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
 	egm_dev->egmpxm = egmpxm;
 	egm_dev->egmphys = egmphys;
 	egm_dev->egmlength = egmlength;
+	egm_dev->retiredpagesphys = retiredpagesphys;
+
 	INIT_LIST_HEAD(&egm_dev->gpus);
 
 	egm_dev->aux_dev.id = egmpxm;
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
index 2e1612445898..2f329a05685d 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
@@ -16,8 +16,9 @@ void remove_gpu(struct nvgrace_egm_dev *egm_dev, struct pci_dev *pdev);
 
 struct nvgrace_egm_dev *
 nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
-			      u64 egmphys, u64 egmlength, u64 egmpxm);
+			      u64 egmphys, u64 egmlength, u64 egmpxm,
+			      u64 retiredpagesphys);
 
 int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
-				   u64 *pegmlength);
+				   u64 *pegmlength, u64 *pretiredpagesphys);
 #endif /* EGM_DEV_H */
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index b1ccd1ac2e0a..534dc3ee6113 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -67,7 +67,7 @@ static struct list_head egm_dev_list;
 static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
 {
 	struct nvgrace_egm_dev_entry *egm_entry = NULL;
-	u64 egmphys, egmlength, egmpxm;
+	u64 egmphys, egmlength, egmpxm, retiredpagesphys;
 	int ret = 0;
 	bool is_new_region = false;
 
@@ -80,7 +80,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
 	if (nvgrace_gpu_has_egm_property(pdev, &egmpxm))
 		goto exit;
 
-	ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength);
+	ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength,
+					     &retiredpagesphys);
 	if (ret)
 		goto exit;
 
@@ -103,7 +104,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
 
 	egm_entry->egm_dev =
 		nvgrace_gpu_create_aux_device(pdev, NVGRACE_EGM_DEV_NAME,
-					      egmphys, egmlength, egmpxm);
+					      egmphys, egmlength, egmpxm,
+					      retiredpagesphys);
 	if (!egm_entry->egm_dev) {
 		ret = -EINVAL;
 		goto free_egm_entry;
diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h
index a66906753267..197255c2a3b7 100644
--- a/include/linux/nvgrace-egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -7,6 +7,7 @@
 #define NVGRACE_EGM_H
 
 #include <linux/auxiliary_bus.h>
+#include <linux/hashtable.h>
 
 #define NVGRACE_EGM_DEV_NAME "egm"
 
@@ -19,6 +20,7 @@ struct nvgrace_egm_dev {
 	struct auxiliary_device aux_dev;
 	phys_addr_t egmphys;
 	size_t egmlength;
+	phys_addr_t retiredpagesphys;
 	u64 egmpxm;
 	struct list_head gpus;
 };
-- 
2.34.1

next prev parent reply	other threads:[~2025-09-04  4:08 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-09-04  4:08 [RFC 00/14] cover-letter: Add virtualization support for EGM ankita
2025-09-04  4:08 ` [RFC 01/14] vfio/nvgrace-gpu: Expand module_pci_driver to allow custom module init ankita
2025-09-04  4:08 ` [RFC 02/14] vfio/nvgrace-gpu: Create auxiliary device for EGM ankita
2025-09-15  6:56   ` Shameer Kolothum
2025-09-04  4:08 ` [RFC 03/14] vfio/nvgrace-gpu: track GPUs associated with the EGM regions ankita
2025-09-15  7:19   ` Shameer Kolothum
2025-09-04  4:08 ` [RFC 04/14] vfio/nvgrace-gpu: Introduce functions to fetch and save EGM info ankita
2025-09-04  4:08 ` [RFC 05/14] vfio/nvgrace-egm: Introduce module to manage EGM ankita
2025-09-05 13:26   ` Jason Gunthorpe
2025-09-15  7:47   ` Shameer Kolothum
2025-09-04  4:08 ` [RFC 06/14] vfio/nvgrace-egm: Introduce egm class and register char device numbers ankita
2025-09-04  4:08 ` [RFC 07/14] vfio/nvgrace-egm: Register auxiliary driver ops ankita
2025-09-05 13:31   ` Jason Gunthorpe
2025-09-04  4:08 ` [RFC 08/14] vfio/nvgrace-egm: Expose EGM region as char device ankita
2025-09-05 13:34   ` Jason Gunthorpe
2025-09-15  8:36   ` Shameer Kolothum
2025-09-04  4:08 ` [RFC 09/14] vfio/nvgrace-egm: Add chardev ops for EGM management ankita
2025-09-05 13:36   ` Jason Gunthorpe
2025-09-04  4:08 ` [RFC 10/14] vfio/nvgrace-egm: Clear Memory before handing out to VM ankita
2025-09-05 13:39   ` Jason Gunthorpe
2025-09-15  8:45   ` Shameer Kolothum
2025-09-04  4:08 ` ankita [this message]
2025-09-15  9:21   ` [RFC 11/14] vfio/nvgrace-egm: Fetch EGM region retired pages list Shameer Kolothum
2025-09-04  4:08 ` [RFC 12/14] vfio/nvgrace-egm: Introduce ioctl to share retired pages ankita
2025-09-04  4:08 ` [RFC 13/14] vfio/nvgrace-egm: expose the egm size through sysfs ankita
2025-09-04  4:08 ` [RFC 14/14] vfio/nvgrace-gpu: Add link from pci to EGM ankita
2025-09-05 13:42   ` Jason Gunthorpe

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:bf1241ed1d6 dfblob:7a026b4d98f dfblob:ca50bc1f67a
dfblob:b8e143542bc dfblob:2e161244589 dfblob:2f329a05685
dfblob:b1ccd1ac2e0 dfblob:534dc3ee611 dfblob:a6690675326
dfblob:197255c2a3b )
 OR (
bs:"[RFC 11/14] vfio/nvgrace-egm: Fetch EGM region retired pages list" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250904040828.319452-12-ankita@nvidia.com \
    --to=ankita@nvidia.com \
    --cc=acurrid@nvidia.com \
    --cc=alex.williamson@redhat.com \
    --cc=aniketa@nvidia.com \
    --cc=anuaggarwal@nvidia.com \
    --cc=apopple@nvidia.com \
    --cc=cjia@nvidia.com \
    --cc=danw@nvidia.com \
    --cc=dnigam@nvidia.com \
    --cc=jgg@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=kevin.tian@intel.com \
    --cc=kjaju@nvidia.com \
    --cc=kvm@vger.kernel.org \
    --cc=kwankhede@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mochs@nvidia.com \
    --cc=skolothumtho@nvidia.com \
    --cc=targupta@nvidia.com \
    --cc=vsethi@nvidia.com \
    --cc=yi.l.liu@intel.com \
    --cc=yishaih@nvidia.com \
    --cc=zhiw@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox