linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jane Chu <jane.chu@oracle.com>
To: david@fromorbit.com, djwong@kernel.org, dan.j.williams@intel.com,
	hch@infradead.org, vishal.l.verma@intel.com,
	dave.jiang@intel.com, agk@redhat.com, snitzer@redhat.com,
	dm-devel@redhat.com, ira.weiny@intel.com, willy@infradead.org,
	vgoyal@redhat.com, linux-fsdevel@vger.kernel.org,
	nvdimm@lists.linux.dev, linux-kernel@vger.kernel.org,
	linux-xfs@vger.kernel.org
Subject: [PATCH v2 2/2] dax,pmem: Implement pmem based dax data recovery
Date: Fri,  5 Nov 2021 19:16:38 -0600	[thread overview]
Message-ID: <20211106011638.2613039-3-jane.chu@oracle.com> (raw)
In-Reply-To: <20211106011638.2613039-1-jane.chu@oracle.com>

For /dev/pmem based dax, enable DAX_OP_RECOVERY mode for
dax_direct_access to translate 'kaddr' over a range that
may contain poison(s); and enable dax_copy_to_iter to
read as much data as possible up till a poisoned page is
encountered; and enable dax_copy_from_iter to clear poison
among a page-aligned range, and then write the good data over.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/md/dm.c       |  2 ++
 drivers/nvdimm/pmem.c | 75 ++++++++++++++++++++++++++++++++++++++++---
 fs/dax.c              | 24 +++++++++++---
 3 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dc354db22ef9..9b3dac916f22 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1043,6 +1043,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_from_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_from_iter(addr, bytes, i);
 		goto out;
 	}
@@ -1067,6 +1068,7 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (!ti)
 		goto out;
 	if (!ti->type->dax_copy_to_iter) {
+		WARN_ON(mode == DAX_OP_RECOVERY);
 		ret = copy_to_iter(addr, bytes, i);
 		goto out;
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3dc99e0bf633..8ae6aa678c51 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
 	if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
-					PFN_PHYS(nr_pages))))
+				 PFN_PHYS(nr_pages)) && mode == DAX_OP_NORMAL))
 		return -EIO;
 
 	if (kaddr)
@@ -303,20 +303,85 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
 }
 
 /*
- * Use the 'no check' versions of copy_from_iter_flushcache() and
- * copy_mc_to_iter() to bypass HARDENED_USERCOPY overhead. Bounds
- * checking, both file offset and device offset, is handled by
- * dax_iomap_actor()
+ * Even though the 'no check' versions of copy_from_iter_flushcache()
+ * and copy_mc_to_iter() are used to bypass HARDENED_USERCOPY overhead,
+ * 'read'/'write' aren't always safe when poison is consumed. They happen
+ * to be safe because the 'read'/'write' range has been guaranteed
+ * be free of poison(s) by a prior call to dax_direct_access() on the
+ * caller stack.
+ * But on a data recovery code path, the 'read'/'write' range is expected
+ * to contain poison(s), and so poison(s) is explicit checked, such that
+ * 'read' can fetch data from clean page(s) up till the first poison is
+ * encountered, and 'write' requires the range be page aligned in order
+ * to restore the poisoned page's memory type back to "rw" after clearing
+ * the poison(s).
+ * In the event of poison related failure, (size_t) -EIO is returned and
+ * caller may check the return value after casting it to (ssize_t).
+ *
+ * TODO: add support for CPUs that support MOVDIR64B instruction for
+ * faster poison clearing, and possibly smaller error blast radius.
  */
 static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	phys_addr_t pmem_off;
+	size_t len, lead_off;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) {
+			if (lead_off || !(PAGE_ALIGNED(bytes))) {
+				dev_warn(dev, "Found poison, but addr(%p) and/or bytes(%#lx) not page aligned\n",
+					addr, bytes);
+				return (size_t) -EIO;
+			}
+			pmem_off = PFN_PHYS(pgoff) + pmem->data_offset;
+			if (pmem_clear_poison(pmem, pmem_off, bytes) !=
+						BLK_STS_OK)
+				return (size_t) -EIO;
+		}
+	}
+
 	return _copy_from_iter_flushcache(addr, bytes, i);
 }
 
 static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 		void *addr, size_t bytes, struct iov_iter *i, int mode)
 {
+	int num_bad;
+	size_t len, lead_off;
+	unsigned long bad_pfn;
+	bool bad_pmem = false;
+	size_t adj_len = bytes;
+	sector_t sector, first_bad;
+	struct pmem_device *pmem = dax_get_private(dax_dev);
+	struct device *dev = pmem->bb.dev;
+
+	if (unlikely(mode == DAX_OP_RECOVERY)) {
+		sector = PFN_PHYS(pgoff) / 512;
+		lead_off = (unsigned long)addr & ~PAGE_MASK;
+		len = PFN_PHYS(PFN_UP(lead_off + bytes));
+		if (pmem->bb.count)
+			bad_pmem = !!badblocks_check(&pmem->bb, sector,
+					len / 512, &first_bad, &num_bad);
+		if (bad_pmem) {
+			bad_pfn = PHYS_PFN(first_bad * 512);
+			if (bad_pfn == pgoff) {
+				dev_warn(dev, "Found poison in page: pgoff(%#lx)\n",
+					pgoff);
+				return -EIO;
+			}
+			adj_len = PFN_PHYS(bad_pfn - pgoff) - lead_off;
+			dev_WARN_ONCE(dev, (adj_len > bytes),
+					"out-of-range first_bad?");
+		}
+		if (adj_len == 0)
+			return (size_t) -EIO;
+	}
+
 	return _copy_mc_to_iter(addr, bytes, i);
 }
 
diff --git a/fs/dax.c b/fs/dax.c
index bea6df1498c3..7640be6b6a97 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1219,6 +1219,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		const size_t size = ALIGN(length + offset, PAGE_SIZE);
 		const sector_t sector = dax_iomap_sector(iomap, pos);
+		long nr_page = PHYS_PFN(size);
+		int dax_mode = DAX_OP_NORMAL;
 		ssize_t map_len;
 		pgoff_t pgoff;
 		void *kaddr;
@@ -1232,8 +1234,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		if (ret)
 			break;
 
-		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-					    DAX_OP_NORMAL, &kaddr, NULL);
+		map_len = dax_direct_access(dax_dev, pgoff, nr_page, dax_mode,
+					    &kaddr, NULL);
+		if (unlikely(map_len == -EIO)) {
+			dax_mode = DAX_OP_RECOVERY;
+			map_len = dax_direct_access(dax_dev, pgoff, nr_page,
+						    dax_mode, &kaddr, NULL);
+		}
 		if (map_len < 0) {
 			ret = map_len;
 			break;
@@ -1252,11 +1259,20 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 		 */
 		if (iov_iter_rw(iter) == WRITE)
 			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 		else
 			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
-					map_len, iter, DAX_OP_NORMAL);
+					map_len, iter, dax_mode);
 
+		/*
+		 * If dax data recovery is enacted via DAX_OP_RECOVERY,
+		 * recovery failure would be indicated by a -EIO return
+		 * in 'xfer' casted as (size_t).
+		 */
+		if ((ssize_t)xfer == -EIO) {
+			ret = -EIO;
+			break;
+		}
 		pos += xfer;
 		length -= xfer;
 		done += xfer;
-- 
2.18.4


  parent reply	other threads:[~2021-11-06  1:17 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-06  1:16 [PATCH v2 0/2] Dax poison recovery Jane Chu
2021-11-06  1:16 ` [PATCH v2 1/2] dax: Introduce normal and recovery dax operation modes Jane Chu
2021-11-06  1:50   ` Darrick J. Wong
2021-11-08 20:43     ` Jane Chu
2021-11-06 16:48   ` Dan Williams
2021-11-08 21:02     ` Jane Chu
2021-11-09  5:26       ` Ira Weiny
2021-11-09  6:04         ` Dan Williams
2021-11-06  1:16 ` Jane Chu [this message]
2021-11-06  2:04   ` [PATCH v2 2/2] dax,pmem: Implement pmem based dax data recovery Darrick J. Wong
2021-11-08 20:53     ` Jane Chu
2021-11-08 21:00     ` Jane Chu
2021-11-09  7:27   ` Christoph Hellwig
2021-11-09 18:48     ` Dan Williams
2021-11-09 19:52       ` Christoph Hellwig
2021-11-09 19:58       ` Jane Chu
2021-11-09 21:02         ` Dan Williams
2021-11-10 18:26           ` Jane Chu
2021-11-12 15:36             ` Mike Snitzer
2021-11-12 18:00               ` Jane Chu
2021-11-09 19:14     ` Jane Chu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211106011638.2613039-3-jane.chu@oracle.com \
    --to=jane.chu@oracle.com \
    --cc=agk@redhat.com \
    --cc=dan.j.williams@intel.com \
    --cc=dave.jiang@intel.com \
    --cc=david@fromorbit.com \
    --cc=djwong@kernel.org \
    --cc=dm-devel@redhat.com \
    --cc=hch@infradead.org \
    --cc=ira.weiny@intel.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=nvdimm@lists.linux.dev \
    --cc=snitzer@redhat.com \
    --cc=vgoyal@redhat.com \
    --cc=vishal.l.verma@intel.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).