All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] device/dax: Allow MCE recovery when accessing PFN metadata
@ 2025-12-30  7:59 Ruidong Tian
  2025-12-30 23:39 ` kernel test robot
  0 siblings, 1 reply; 2+ messages in thread
From: Ruidong Tian @ 2025-12-30  7:59 UTC (permalink / raw)
  To: dan.j.williams, vishal.l.verma, dave.jiang, tony.luck, bp,
	linux-cxl, linux-edac, linux-kernel, xueshuai
  Cc: Ruidong Tian

Both fsdax and devdax modes require significant space to store Page Frame
Number (PFN) metadata (struct page). For a 1TiB namespace, approximately
16GiB (about 17.18GB) of metadata is needed[0]. As namespace sizes scale,
hardware memory errors within this metadata region become increasingly
frequent.

Currently, the kernel treats any access to corrupted PFN metadata as an
unrecoverable event, leading to an immediate system panic. However, in
DAX scenarios (e.g., CXL-attached memory), the impact of metadata
corruption is logically confined to the physical device backing that
specific memory range.

Instead of a global panic, the kernel can ideally localize the failure.
By allowing the affected DAX memory range to be offlined or the specific
device to be decommissioned, we can limit the blast radius of hardware
errors. This enables other processes to migrate or exit gracefully
rather than being terminated by a system-wide crash.

Reproduction and testing:
1. Inject an error into the PFN metadata
2. mmap the range and read from it

Before applying this patch, the kernel panics:
  CPU 120: Machine Check Exception: f Bank 1: bd80000000100134
  RIP 10:<ffffffff8598300e> {dax_set_mapping.isra.0+0xce/0x140}
  TSC ee24b9e2d5 ADDR b213398000 MISC 86 PPIN 6deeb6484732971d
  PROCESSOR 0:a06d1 TIME 1765336050 SOCKET 0 APIC b1 microcode 10003f3
  Run the above through 'mcelog --ascii'
  Machine check: Data load in unrecoverable area of kernel
Kernel panic - not syncing: Fatal local machine check

After applying this patch:
The user application receives SIGBUS and the system stays alive.

[0]: https://docs.pmem.io/ndctl-user-guide/managing-namespaces#fsdax-and-devdax-capacity-considerations

Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
---
 drivers/dax/dax-private.h | 26 ++++++++++++++++++++++++++
 drivers/dax/device.c      | 20 ++++++++++++++++----
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 0867115aeef2..84325963fa3d 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -129,4 +129,30 @@ static inline bool dax_align_valid(unsigned long align)
 	return align == PAGE_SIZE;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifndef copy_mc_to_kernel
+static inline int dax_test_page_mc(const struct page *page)
+{
+	return 0;
+}
+static inline int dax_test_folio_mc(const struct folio *page)
+{
+	return 0;
+}
+#else
+#include <linux/uaccess.h>
+static inline int dax_test_page_mc(const struct page *page)
+{
+	struct page _p;
+
+	return copy_mc_to_kernel(&_p, page, sizeof(struct page));
+}
+static inline int dax_test_folio_mc(const struct folio *folio)
+{
+	struct folio _f;
+
+	return copy_mc_to_kernel(&_f, folio, sizeof(struct folio));
+}
+#endif
+#endif
 #endif
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 22999a402e02..a7f2217b9b62 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -80,7 +80,7 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 	return -1;
 }
 
-static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
+static int dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
 			      unsigned long fault_size)
 {
 	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
@@ -95,6 +95,13 @@ static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
 	pgoff = linear_page_index(vmf->vma,
 			ALIGN_DOWN(vmf->address, fault_size));
 
+	for (i = 0; i < nr_pages; i++) {
+		struct page *p = pfn_to_page(pfn + i);
+
+		if (dax_test_page_mc(p) || dax_test_page_mc(page_folio(p)))
+			return -EFAULT;
+	}
+
 	for (i = 0; i < nr_pages; i++) {
 		struct folio *folio = pfn_folio(pfn + i);
 
@@ -104,6 +111,8 @@ static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
 		folio->mapping = filp->f_mapping;
 		folio->index = pgoff + i;
 	}
+
+	return 0;
 }
 
 static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
@@ -134,7 +143,8 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 
 	pfn = PHYS_PFN(phys);
 
-	dax_set_mapping(vmf, pfn, fault_size);
+	if (dax_set_mapping(vmf, pfn, fault_size))
+		return VM_FAULT_SIGBUS;
 
 	return vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn),
 					vmf->flags & FAULT_FLAG_WRITE);
@@ -178,7 +188,8 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 
 	pfn = PHYS_PFN(phys);
 
-	dax_set_mapping(vmf, pfn, fault_size);
+	if (dax_set_mapping(vmf, pfn, fault_size))
+		return VM_FAULT_SIGBUS;
 
 	return vmf_insert_folio_pmd(vmf, page_folio(pfn_to_page(pfn)),
 				vmf->flags & FAULT_FLAG_WRITE);
@@ -224,7 +235,8 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 
 	pfn = PHYS_PFN(phys);
 
-	dax_set_mapping(vmf, pfn, fault_size);
+	if (dax_set_mapping(vmf, pfn, fault_size))
+		return VM_FAULT_SIGBUS;
 
 	return vmf_insert_folio_pud(vmf, page_folio(pfn_to_page(pfn)),
 				vmf->flags & FAULT_FLAG_WRITE);
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [RFC PATCH] device/dax: Allow MCE recovery when accessing PFN metadata
  2025-12-30  7:59 [RFC PATCH] device/dax: Allow MCE recovery when accessing PFN metadata Ruidong Tian
@ 2025-12-30 23:39 ` kernel test robot
  0 siblings, 0 replies; 2+ messages in thread
From: kernel test robot @ 2025-12-30 23:39 UTC (permalink / raw)
  To: Ruidong Tian; +Cc: oe-kbuild-all

Hi Ruidong,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on cxl/next]
[also build test ERROR on linus/master v6.19-rc3 next-20251219]
[cannot apply to cxl/pending]
[If your patch was applied to the wrong git tree, kindly drop us a note.
When submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ruidong-Tian/device-dax-Allow-MCE-recovery-when-accessing-PFN-metadata/20251230-160432
base:   https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git next
patch link:    https://lore.kernel.org/r/20251230075951.85252-1-tianruidong%40linux.alibaba.com
patch subject: [RFC PATCH] device/dax: Allow MCE recovery when accessing PFN metadata
config: x86_64-rhel-9.4-kunit (https://download.01.org/0day-ci/archive/20251231/202512310049.R86iwUtl-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251231/202512310049.R86iwUtl-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512310049.R86iwUtl-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from drivers/dax/super.c:16:
>> drivers/dax/dax-private.h:158:2: error: #endif without #if
     158 | #endif
         |  ^~~~~
--
   In file included from drivers/dax/device.c:13:
>> drivers/dax/dax-private.h:158:2: error: #endif without #if
     158 | #endif
         |  ^~~~~
   In file included from include/linux/mmzone.h:23,
                    from include/linux/memremap.h:5,
                    from drivers/dax/device.c:3:
   drivers/dax/device.c: In function 'dax_set_mapping':
>> include/linux/page-flags.h:308:33: error: passing argument 1 of 'dax_test_page_mc' from incompatible pointer type [-Wincompatible-pointer-types]
     306 | #define page_folio(p)           (_Generic((p),                          \
         |                                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     307 |         const struct page *:    (const struct folio *)_compound_head(p), \
         |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     308 |         struct page *:          (struct folio *)_compound_head(p)))
         |         ~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         |                                 |
         |                                 struct folio *
   drivers/dax/device.c:93:61: note: in expansion of macro 'page_folio'
      93 |                 if (dax_test_page_mc(p) || dax_test_page_mc(page_folio(p)))
         |                                                             ^~~~~~~~~~
   drivers/dax/dax-private.h:144:55: note: expected 'const struct page *' but argument is of type 'struct folio *'
     144 | static inline int dax_test_page_mc(const struct page *page)
         |                                    ~~~~~~~~~~~~~~~~~~~^~~~
--
   In file included from device.c:13:
   dax-private.h:158:2: error: #endif without #if
     158 | #endif
         |  ^~~~~
   In file included from include/linux/mmzone.h:23,
                    from include/linux/memremap.h:5,
                    from device.c:3:
   device.c: In function 'dax_set_mapping':
>> include/linux/page-flags.h:308:33: error: passing argument 1 of 'dax_test_page_mc' from incompatible pointer type [-Wincompatible-pointer-types]
     306 | #define page_folio(p)           (_Generic((p),                          \
         |                                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     307 |         const struct page *:    (const struct folio *)_compound_head(p), \
         |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     308 |         struct page *:          (struct folio *)_compound_head(p)))
         |         ~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         |                                 |
         |                                 struct folio *
   device.c:93:61: note: in expansion of macro 'page_folio'
      93 |                 if (dax_test_page_mc(p) || dax_test_page_mc(page_folio(p)))
         |                                                             ^~~~~~~~~~
   dax-private.h:144:55: note: expected 'const struct page *' but argument is of type 'struct folio *'
     144 | static inline int dax_test_page_mc(const struct page *page)
         |                                    ~~~~~~~~~~~~~~~~~~~^~~~


vim +158 drivers/dax/dax-private.h

b2485f92a36cf3 Ruidong Tian 2025-12-30  132  
b2485f92a36cf3 Ruidong Tian 2025-12-30  133  #ifndef copy_mc_to_kernel
b2485f92a36cf3 Ruidong Tian 2025-12-30  134  static inline int dax_test_page_mc(const struct page *page)
b2485f92a36cf3 Ruidong Tian 2025-12-30  135  {
b2485f92a36cf3 Ruidong Tian 2025-12-30  136  	return 0;
b2485f92a36cf3 Ruidong Tian 2025-12-30  137  }
b2485f92a36cf3 Ruidong Tian 2025-12-30  138  static inline int dax_test_folio_mc(const struct folio *page)
b2485f92a36cf3 Ruidong Tian 2025-12-30  139  {
b2485f92a36cf3 Ruidong Tian 2025-12-30  140  	return 0;
b2485f92a36cf3 Ruidong Tian 2025-12-30  141  }
b2485f92a36cf3 Ruidong Tian 2025-12-30  142  #else
b2485f92a36cf3 Ruidong Tian 2025-12-30  143  #include <linux/uaccess.h>
b2485f92a36cf3 Ruidong Tian 2025-12-30  144  static inline int dax_test_page_mc(const struct page *page)
b2485f92a36cf3 Ruidong Tian 2025-12-30  145  {
b2485f92a36cf3 Ruidong Tian 2025-12-30  146  	struct page _p;
b2485f92a36cf3 Ruidong Tian 2025-12-30  147  
b2485f92a36cf3 Ruidong Tian 2025-12-30  148  	return copy_mc_to_kernel(&_p, page, sizeof(struct page));
b2485f92a36cf3 Ruidong Tian 2025-12-30  149  }
b2485f92a36cf3 Ruidong Tian 2025-12-30  150  static inline int dax_test_folio_mc(const struct folio *folio)
b2485f92a36cf3 Ruidong Tian 2025-12-30  151  {
b2485f92a36cf3 Ruidong Tian 2025-12-30  152  	struct folio _f;
b2485f92a36cf3 Ruidong Tian 2025-12-30  153  
b2485f92a36cf3 Ruidong Tian 2025-12-30  154  	return copy_mc_to_kernel(&_f, folio, sizeof(struct folio));
b2485f92a36cf3 Ruidong Tian 2025-12-30  155  }
b2485f92a36cf3 Ruidong Tian 2025-12-30  156  #endif
b2485f92a36cf3 Ruidong Tian 2025-12-30  157  #endif
efebc711180f7f Dave Jiang   2017-04-07 @158  #endif

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-12-30 23:40 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-30  7:59 [RFC PATCH] device/dax: Allow MCE recovery when accessing PFN metadata Ruidong Tian
2025-12-30 23:39 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.