* [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Russ Anderson @ 2008-05-02 0:44 UTC (permalink / raw)
To: linux-kernel, linux-ia64
Cc: Linus Torvalds, Andrew Morton, Tony Luck, Christoph Lameter,
Russ Anderson
Migrate data off pages with correctable memory errors. This patch is the
ia64 specific piece. It connects the CPE handler to the page migration
code. It is implemented as a kernel loadable module, similar to the mca
recovery code (mca_recovery.ko). This allows the feature to be turned off
by uninstalling the module. Creates /proc/badram to display bad page
information and free bad pages.
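For illustration, a hypothetical session might look like this (the
addresses are made up; the sizes assume 16KB pages):

  # cat /proc/badram
  Bad RAM: 48 kB, 3 pages marked bad
  List of bad physical pages
   0x00304c50000 0x00304c64000 0x00304c78000
  # echo 0x304c50000 > /proc/badram     (free one bad page)
  # echo 0 > /proc/badram               (free all bad pages)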
Signed-off-by: Russ Anderson <rja@sgi.com>
---
arch/ia64/Kconfig | 3
arch/ia64/kernel/Makefile | 1
arch/ia64/kernel/cpe_migrate.c | 382 +++++++++++++++++++++++++++++++++++++++++
arch/ia64/kernel/mca.c | 34 +++
include/asm-ia64/mca.h | 5
include/asm-ia64/page.h | 1
include/linux/migrate.h | 2
mm/migrate.c | 7
mm/page_alloc.c | 1
9 files changed, 434 insertions(+), 2 deletions(-)
Index: linus/arch/ia64/kernel/Makefile
===================================================================
--- linus.orig/arch/ia64/kernel/Makefile 2008-05-01 19:36:40.000000000 -0500
+++ linus/arch/ia64/kernel/Makefile 2008-05-01 19:36:49.000000000 -0500
@@ -27,6 +27,7 @@ obj-$(CONFIG_PERFMON) += perfmon_defaul
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o
+obj-$(CONFIG_IA64_CPE_MIGRATE) += cpe_migrate.o
obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
Index: linus/arch/ia64/kernel/cpe_migrate.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linus/arch/ia64/kernel/cpe_migrate.c 2008-05-01 19:36:49.000000000 -0500
@@ -0,0 +1,382 @@
+/*
+ * File: cpe_migrate.c
+ * Purpose: Migrate data from physical pages with excessive correctable
+ * errors to new physical pages. Keep the old pages on a discard
+ * list.
+ *
+ * Copyright (C) 2008 SGI - Silicon Graphics Inc.
+ * Copyright (C) 2008 Russ Anderson <rja@sgi.com>
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kallsyms.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/workqueue.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/delay.h>
+#include <linux/ptrace.h>
+#include <linux/irq.h>
+#include <linux/vmalloc.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/rmap.h>
+#include <linux/memcontrol.h>
+
+#include <asm/machvec.h>
+#include <asm/page.h>
+#include <asm/system.h>
+#include <asm/sal.h>
+#include <asm/hw_irq.h>
+#include <asm/sn/sn_cpuid.h>
+#include <asm/mca.h>
+
+#define BADRAM_BASENAME "badram"
+#define CE_HISTORY_LENGTH 30
+
+static u64 cpe_paddr[CE_HISTORY_LENGTH];
+static u16 cpe_node[CE_HISTORY_LENGTH];
+static int cpe_polling_enabled = 1;
+static int cpe_head;
+static int cpe_tail;
+
+int work_scheduled;
+spinlock_t cpe_migrate_lock;
+
+void
+get_physical_address(void *buffer, u64 *paddr, u16 *node)
+{
+ sal_log_record_header_t *rh;
+ sal_log_mem_dev_err_info_t *mdei;
+ ia64_err_rec_t *err_rec;
+ sal_log_platform_err_info_t *plat_err;
+ efi_guid_t guid;
+
+ err_rec = (ia64_err_rec_t *)buffer;
+ rh = (sal_log_record_header_t *)&err_rec->sal_elog_header;
+ *paddr = 0;
+ *node = 0;
+
+ /*
+ * Make sure it is a corrected error.
+ */
+ if (rh->severity != sal_log_severity_corrected)
+ return;
+
+ plat_err = (sal_log_platform_err_info_t *)&err_rec->proc_err;
+
+ guid = (efi_guid_t)plat_err->mem_dev_err.header.guid;
+ if (efi_guidcmp(guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) == 0) {
+ /*
+ * Memory cpe
+ */
+ mdei = (sal_log_mem_dev_err_info_t *)&plat_err->mem_dev_err;
+ if (mdei->valid.oem_data) {
+ if (mdei->valid.physical_addr)
+ *paddr = mdei->physical_addr;
+
+ if (mdei->valid.node) {
+ if (ia64_platform_is("sn2"))
+ *node = nasid_to_cnodeid(mdei->node);
+ else
+ *node = mdei->node;
+ }
+ return;
+ }
+ }
+ return;
+}
+
+struct page *
+alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
+{
+
+ return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
+
+static int
+ia64_mca_cpe_move_page(u64 paddr, u32 node)
+{
+ LIST_HEAD(pagelist);
+ struct page *page;
+ int ret;
+ unsigned long irq_flags;
+
+ local_irq_save(irq_flags);
+ /*
+ * Validate page
+ */
+ if (!paddr)
+ return -1;
+ if (!ia64_phys_addr_valid(paddr))
+ return -1;
+ if (!pfn_valid(paddr >> PAGE_SHIFT))
+ return -1;
+
+ /*
+ * convert physical address to page number
+ */
+ page = phys_to_page(paddr);
+
+ if (!spin_trylock(&cpe_migrate_lock)) {
+ local_irq_restore(irq_flags);
+ return -1;
+ }
+
+ preempt_disable();
+ migrate_prep();
+ ret = isolate_lru_page(page, &pagelist);
+ preempt_enable();
+ if (ret) {
+ spin_unlock_irqrestore(&cpe_migrate_lock, irq_flags);
+ return ret;
+ }
+
+ SetPageMemError(page); /* Mark the page as bad */
+ ret = migrate_pages(&pagelist, alloc_migrate_page, node);
+ if (ret == 0)
+ list_add_tail(&page->lru, &badpagelist);
+
+ spin_unlock_irqrestore(&cpe_migrate_lock, irq_flags);
+ return 0;
+}
+
+/*
+ * ia64_mca_cpe_migrate
+ * The worker that does the actual migration. It pulls a
+ * physical address off the list and calls the migration code.
+ *
+ * Inputs
+ * none
+ * Outputs
+ * none
+ */
+static void
+ia64_mca_cpe_migrate(struct work_struct *unused)
+{
+ int ret;
+ u64 paddr;
+ u16 node;
+
+ do {
+ if (cpe_paddr[cpe_tail]) {
+ paddr = cpe_paddr[cpe_tail];
+ node = cpe_node[cpe_tail];
+
+ ret = ia64_mca_cpe_move_page(paddr, node);
+ if (ret <= 0)
+ /*
+ * Even though the return status is negative,
+ * clear the entry. If the same address has
+ * another CPE it will be re-added to the list.
+ */
+ cpe_paddr[cpe_tail] = 0;
+
+ }
+ if (++cpe_tail >= CE_HISTORY_LENGTH)
+ cpe_tail = 0;
+
+ } while (cpe_tail != cpe_head);
+ work_scheduled = 0;
+}
+static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
+
+/*
+ * ce_setup_migrate
+ * Get the physical address out of the CPE record, add it
+ * to the list of addresses to migrate (if not already on),
+ * and schedule the back end worker task. This is called
+ * in interrupt context so cannot directly call the migration
+ * code.
+ *
+ * Inputs
+ * rec The CPE record
+ * Outputs
+ * 1 on Success, -1 on failure
+ */
+static int
+ce_setup_migrate(void *rec)
+{
+ u64 paddr;
+ u16 node;
+ /* int head, tail; */
+ int i;
+
+ if (!rec)
+ return -1;
+
+ get_physical_address(rec, &paddr, &node);
+ if (!paddr)
+ return -1;
+
+ if (!((cpe_head == cpe_tail) && (cpe_paddr[cpe_head] == 0)))
+ /*
+ * List not empty
+ */
+ for (i = 0; i < CE_HISTORY_LENGTH; i++)
+ if ((PAGE_ALIGN(cpe_paddr[i])) == PAGE_ALIGN(paddr))
+ return 1; /* already on the list */
+
+ if (cpe_paddr[cpe_head] == 0) {
+ cpe_paddr[cpe_head] = paddr;
+ cpe_node[cpe_head] = node;
+
+ if (++cpe_head >= CE_HISTORY_LENGTH)
+ cpe_head = 0;
+ }
+
+ if (!work_scheduled) {
+ work_scheduled = 1;
+ schedule_work(&cpe_enable_work);
+ }
+
+ return 1;
+}
+
+/*
+ * =============================================================================
+ */
+
+int
+freeOneBadPage(unsigned long paddr)
+{
+ struct page *page, *page2, *target;
+
+ /*
+ * Verify page address
+ */
+ target = phys_to_page(paddr);
+ list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+ if (page != target)
+ continue;
+
+ ClearPageMemError(page); /* Mark the page as good */
+ move_to_lru(page);
+ list_del(&page->lru);
+ totalbad_pages--;
+ break;
+ }
+ return 0;
+}
+
+int
+freeAllBadPages(void)
+{
+ struct page *page, *page2;
+
+ list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+ ClearPageMemError(page); /* Mark the page as good */
+ totalbad_pages--;
+ }
+ putback_lru_pages(&badpagelist);
+ return 0;
+}
+
+#define OPT_LEN 16
+
+static ssize_t
+badpage_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ char optstr[OPT_LEN];
+ unsigned long opt;
+	int len = OPT_LEN - 1;	/* leave room for the '\0' */
+
+ if (count < len)
+ len = count;
+
+ if (copy_from_user(optstr, user, len))
+ return -EFAULT;
+ optstr[len] = '\0';
+
+ opt = simple_strtoul(optstr, NULL, 0);
+ if (opt == 0)
+ freeAllBadPages();
+ else
+ freeOneBadPage(opt);
+
+ return count;
+}
+
+int
+badpage_show(struct seq_file *file, void *data)
+{
+ struct page *page, *page2;
+ int i = 0;
+
+ seq_printf(file, "Bad RAM: %d kB, %d pages marked bad\n"
+ "List of bad physical pages\n",
+ totalbad_pages << (PAGE_SHIFT - 10), totalbad_pages);
+ list_for_each_entry_safe(page, page2, &badpagelist, lru) {
+ seq_printf(file, " 0x%011lx", page_to_phys(page));
+ if (!(++i % 5))
+ seq_printf(file, "\n");
+ }
+ seq_printf(file, "\n");
+ return 0;
+}
+
+int
+badpage_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, badpage_show, NULL);
+}
+
+static struct file_operations proc_badpage_operations = {
+ .open = badpage_open,
+	.write = badpage_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct proc_dir_entry *proc_badpage;
+
+int __init cpe_migrate_external_handler_init(void)
+{
+ spin_lock_init(&cpe_migrate_lock);
+
+ /* register external ce handler */
+ if (ia64_reg_CE_extension(ce_setup_migrate)) {
+ printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
+ return -EFAULT;
+ }
+ cpe_poll_enabled = cpe_polling_enabled;
+
+ proc_badpage = create_proc_entry(BADRAM_BASENAME, 0644, NULL);
+ if (proc_badpage == NULL) {
+ printk(KERN_ERR "unable to create %s proc entry",
+ BADRAM_BASENAME);
+ return -EINVAL;
+ }
+ proc_badpage->proc_fops = &proc_badpage_operations;
+ printk(KERN_INFO "Registered badram Driver\n");
+ return 0;
+}
+
+void __exit cpe_migrate_external_handler_exit(void)
+{
+ /* unregister external mca handlers */
+ ia64_unreg_CE_extension();
+ remove_proc_entry(BADRAM_BASENAME, NULL);
+}
+
+module_init(cpe_migrate_external_handler_init);
+module_exit(cpe_migrate_external_handler_exit);
+
+module_param(cpe_polling_enabled, int, 0644);
+MODULE_PARM_DESC(cpe_polling_enabled,
+ "Enable polling with migration");
+
+MODULE_AUTHOR("Russ Anderson <rja@sgi.com>");
+MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
+MODULE_LICENSE("GPL");
Index: linus/arch/ia64/kernel/mca.c
===================================================================
--- linus.orig/arch/ia64/kernel/mca.c 2008-05-01 19:36:40.000000000 -0500
+++ linus/arch/ia64/kernel/mca.c 2008-05-01 19:36:49.000000000 -0500
@@ -68,6 +68,9 @@
*
* 2007-04-27 Russ Anderson <rja@sgi.com>
* Support multiple cpus going through OS_MCA in the same event.
+ *
+ * 2008-04-22 Russ Anderson <rja@sgi.com>
+ * Migrate data off pages with correctable memory errors.
*/
#include <linux/jiffies.h>
#include <linux/types.h>
@@ -163,7 +166,11 @@ static int cmc_polling_enabled = 1;
* but encounters problems retrieving CPE logs. This should only be
* necessary for debugging.
*/
-static int cpe_poll_enabled = 1;
+int cpe_poll_enabled = 1;
+EXPORT_SYMBOL(cpe_poll_enabled);
+
+LIST_HEAD(badpagelist);
+EXPORT_SYMBOL(badpagelist);
extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
@@ -525,6 +532,28 @@ EXPORT_SYMBOL_GPL(mca_recover_range);
#ifdef CONFIG_ACPI
+/* Function pointer to Corrected Error memory migration driver */
+int (*ia64_mca_ce_extension)(void *) = NULL;
+
+int
+ia64_reg_CE_extension(int (*fn)(void *))
+{
+ if (ia64_mca_ce_extension)
+ return 1;
+
+ ia64_mca_ce_extension = fn;
+ return 0;
+}
+EXPORT_SYMBOL(ia64_reg_CE_extension);
+
+void
+ia64_unreg_CE_extension(void)
+{
+ if (ia64_mca_ce_extension)
+ ia64_mca_ce_extension = NULL;
+}
+EXPORT_SYMBOL(ia64_unreg_CE_extension);
+
int cpe_vector = -1;
int ia64_cpe_irq = -1;
@@ -534,6 +563,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
static unsigned long cpe_history[CPE_HISTORY_LENGTH];
static int index;
static DEFINE_SPINLOCK(cpe_history_lock);
+ int recover;
IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
__func__, cpe_irq, smp_processor_id());
@@ -580,6 +610,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
out:
/* Get the CPE error record and log it */
ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+ recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
+ IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
return IRQ_HANDLED;
}
Index: linus/arch/ia64/Kconfig
===================================================================
--- linus.orig/arch/ia64/Kconfig 2008-05-01 19:36:40.000000000 -0500
+++ linus/arch/ia64/Kconfig 2008-05-01 19:36:49.000000000 -0500
@@ -456,6 +456,9 @@ config COMPAT_FOR_U64_ALIGNMENT
config IA64_MCA_RECOVERY
tristate "MCA recovery from errors other than TLB."
+config IA64_CPE_MIGRATE
+ tristate "Migrate data off pages with correctable errors"
+
config PERFMON
bool "Performance monitor support"
help
Index: linus/include/asm-ia64/mca.h
===================================================================
--- linus.orig/include/asm-ia64/mca.h 2008-05-01 19:36:40.000000000 -0500
+++ linus/include/asm-ia64/mca.h 2008-05-01 19:36:49.000000000 -0500
@@ -137,6 +137,7 @@ extern unsigned long __per_cpu_mca[NR_CP
extern int cpe_vector;
extern int ia64_cpe_irq;
+extern int cpe_poll_enabled;
extern void ia64_mca_init(void);
extern void ia64_mca_cpu_init(void *);
extern void ia64_os_mca_dispatch(void);
@@ -150,9 +151,13 @@ extern void ia64_slave_init_handler(void
extern void ia64_mca_cmc_vector_setup(void);
extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
extern void ia64_unreg_MCA_extension(void);
+extern int ia64_reg_CE_extension(int (*fn)(void *));
+extern void ia64_unreg_CE_extension(void);
extern u64 ia64_get_rnat(u64 *);
extern void ia64_mca_printk(const char * fmt, ...)
__attribute__ ((format (printf, 1, 2)));
+
+extern struct list_head badpagelist;
struct ia64_mca_notify_die {
struct ia64_sal_os_state *sos;
Index: linus/mm/migrate.c
===================================================================
--- linus.orig/mm/migrate.c 2008-05-01 19:36:40.000000000 -0500
+++ linus/mm/migrate.c 2008-05-01 19:36:49.000000000 -0500
@@ -64,6 +64,7 @@ int isolate_lru_page(struct page *page,
}
return ret;
}
+EXPORT_SYMBOL(isolate_lru_page);
/*
* migrate_prep() needs to be called before we start compiling a list of pages
@@ -81,8 +82,9 @@ int migrate_prep(void)
return 0;
}
+EXPORT_SYMBOL(migrate_prep);
-static inline void move_to_lru(struct page *page)
+inline void move_to_lru(struct page *page)
{
if (PageActive(page)) {
/*
@@ -96,6 +98,7 @@ static inline void move_to_lru(struct pa
}
put_page(page);
}
+EXPORT_SYMBOL(move_to_lru);
/*
* Add isolated pages on the list back to the LRU.
@@ -115,6 +118,7 @@ int putback_lru_pages(struct list_head *
}
return count;
}
+EXPORT_SYMBOL(putback_lru_pages);
/*
* Restore a potential migration pte to a working pte entry
@@ -805,6 +809,7 @@ out:
return nr_failed + retry;
}
+EXPORT_SYMBOL(migrate_pages);
#ifdef CONFIG_NUMA
/*
Index: linus/include/asm-ia64/page.h
===================================================================
--- linus.orig/include/asm-ia64/page.h 2008-05-01 19:36:40.000000000 -0500
+++ linus/include/asm-ia64/page.h 2008-05-01 19:36:49.000000000 -0500
@@ -122,6 +122,7 @@ extern unsigned long max_low_pfn;
#endif
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
+#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
Index: linus/mm/page_alloc.c
===================================================================
--- linus.orig/mm/page_alloc.c 2008-05-01 19:36:40.000000000 -0500
+++ linus/mm/page_alloc.c 2008-05-01 19:36:49.000000000 -0500
@@ -72,6 +72,7 @@ unsigned long totalreserve_pages __read_
long nr_swap_pages;
int percpu_pagelist_fraction;
unsigned int totalbad_pages;
+EXPORT_SYMBOL(totalbad_pages);
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
Index: linus/include/linux/migrate.h
===================================================================
--- linus.orig/include/linux/migrate.h 2008-05-01 19:36:40.000000000 -0500
+++ linus/include/linux/migrate.h 2008-05-01 19:36:49.000000000 -0500
@@ -38,6 +38,7 @@ extern int migrate_prep(void);
extern int migrate_vmas(struct mm_struct *mm,
const nodemask_t *from, const nodemask_t *to,
unsigned long flags);
+extern void move_to_lru(struct page *);
#else
static inline int vma_migratable(struct vm_area_struct *vma)
{ return 0; }
@@ -59,6 +60,7 @@ static inline int migrate_vmas(struct mm
{
return -ENOSYS;
}
+static inline void move_to_lru(struct page *p) { return; }
/* Possible settings for the migrate_page() method in address_operations */
#define migrate_page NULL
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Christoph Lameter @ 2008-05-02 1:22 UTC (permalink / raw)
To: Russ Anderson
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck
On Thu, 1 May 2008, Russ Anderson wrote:
> +int
> +freeOneBadPage(unsigned long paddr)
> +{
> + struct page *page, *page2, *target;
> +
> + /*
> + * Verify page address
> + */
> + target = phys_to_page(paddr);
> + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> + if (page != target)
> + continue;
> +
> + ClearPageMemError(page); /* Mark the page as good */
> + move_to_lru(page);
> + list_del(&page->lru);
This is the wrong order: move_to_lru() uses the lru field, so do the
list_del first, then the move. Note that there is a function
putback_lru_pages() that is used to put a list of pages back on the LRU if
migration has failed and we give up on them. Could you use
putback_lru_pages() and avoid exporting move_to_lru()?
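I.e. something like this (untested):

	ClearPageMemError(page);	/* Mark the page as good */
	list_del(&page->lru);		/* unlink from badpagelist first... */
	move_to_lru(page);		/* ...now move_to_lru() may reuse page->lru */
	totalbad_pages--;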
> +
> +int
> +freeAllBadPages(void)
> +{
> + struct page *page, *page2;
> +
> + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> + ClearPageMemError(page); /* Mark the page as good */
> + totalbad_pages--;
> + }
> + putback_lru_pages(&badpagelist);
> + return 0;
> +}
Ahh.. Here you use it.
> Index: linus/include/asm-ia64/page.h
> ===================================================================
> --- linus.orig/include/asm-ia64/page.h 2008-05-01 19:36:40.000000000 -0500
> +++ linus/include/asm-ia64/page.h 2008-05-01 19:36:49.000000000 -0500
> @@ -122,6 +122,7 @@ extern unsigned long max_low_pfn;
> #endif
>
> #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
> +#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
#define phys_to_page virt_to_page ?
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Russ Anderson @ 2008-05-02 2:43 UTC (permalink / raw)
To: Christoph Lameter
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck
On Thu, May 01, 2008 at 06:22:29PM -0700, Christoph Lameter wrote:
> On Thu, 1 May 2008, Russ Anderson wrote:
>
> > +int
> > +freeOneBadPage(unsigned long paddr)
> > +{
> > + struct page *page, *page2, *target;
> > +
> > + /*
> > + * Verify page address
> > + */
> > + target = phys_to_page(paddr);
> > + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> > + if (page != target)
> > + continue;
> > +
> > + ClearPageMemError(page); /* Mark the page as good */
> > + move_to_lru(page);
> > + list_del(&page->lru);
>
> This is the wrong order. move_to_lru() uses the lru field. So do the
> list_del first then the move. Note that there is a function
> putback_lru_pages() that is used to put a list of pages back to the lru if
> migration has failed and we give up on them. Could you use
> putback_lru_pages() and then avoid the exporting of move_to_lru()?
I'll put the page on a different list and call putback_lru_pages().
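Roughly (untested sketch):

	LIST_HEAD(freelist);
	...
	ClearPageMemError(page);		/* Mark the page as good */
	list_move(&page->lru, &freelist);	/* off badpagelist */
	totalbad_pages--;
	...
	putback_lru_pages(&freelist);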
> > +
> > +int
> > +freeAllBadPages(void)
> > +{
> > + struct page *page, *page2;
> > +
> > + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> > + ClearPageMemError(page); /* Mark the page as good */
> > + totalbad_pages--;
> > + }
> > + putback_lru_pages(&badpagelist);
> > + return 0;
> > +}
>
> Ahh.. Here you use it.
This is when deleting all the pages on the list.
> > Index: linus/include/asm-ia64/page.h
> > ===================================================================
> > --- linus.orig/include/asm-ia64/page.h 2008-05-01 19:36:40.000000000 -0500
> > +++ linus/include/asm-ia64/page.h 2008-05-01 19:36:49.000000000 -0500
> > @@ -122,6 +122,7 @@ extern unsigned long max_low_pfn;
> > #endif
> >
> > #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
> > +#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
>
> #define phys_to_page virt_to_page ?
I'm not sure what you are asking.
#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
Do you mean change virt_to_page() like this?
#define virt_to_page(kaddr) phys_to_page(__pa(kaddr))
Thanks,
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Pekka Enberg @ 2008-05-02 9:58 UTC (permalink / raw)
To: Russ Anderson
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter
Hi Russ,
On Fri, May 2, 2008 at 3:44 AM, Russ Anderson <rja@sgi.com> wrote:
> Migrate data off pages with correctable memory errors. This patch is the
> ia64 specific piece. It connects the CPE handler to the page migration
> code. It is implemented as a kernel loadable module, similar to the mca
> recovery code (mca_recovery.ko). This allows the feature to be turned off
> by uninstalling the module. Creates /proc/badram to display bad page
> information and free bad pages.
>
> Signed-off-by: Russ Anderson <rja@sgi.com>
I think sparse and checkpatch would have caught most of these but here goes:
> +int work_scheduled;
> +spinlock_t cpe_migrate_lock;
static?
> +void
> +get_physical_address(void *buffer, u64 *paddr, u16 *node)
> +{
static?
> + sal_log_record_header_t *rh;
> + sal_log_mem_dev_err_info_t *mdei;
> + ia64_err_rec_t *err_rec;
> + sal_log_platform_err_info_t *plat_err;
> + efi_guid_t guid;
> +
> + err_rec = (ia64_err_rec_t *)buffer;
Unnecessary cast from void *.
> + rh = (sal_log_record_header_t *)&err_rec->sal_elog_header;
> + *paddr = 0;
> + *node = 0;
> +
> + /*
> + * Make sure it is a corrected error.
> + */
> + if (rh->severity != sal_log_severity_corrected)
> + return;
> +
> + plat_err = (sal_log_platform_err_info_t *)&err_rec->proc_err;
> +
> + guid = (efi_guid_t)plat_err->mem_dev_err.header.guid;
> + if (efi_guidcmp(guid, SAL_PLAT_MEM_DEV_ERR_SECT_GUID) == 0) {
> + /*
> + * Memory cpe
> + */
> + mdei = (sal_log_mem_dev_err_info_t *)&plat_err->mem_dev_err;
> + if (mdei->valid.oem_data) {
> + if (mdei->valid.physical_addr)
> + *paddr = mdei->physical_addr;
> +
> + if (mdei->valid.node) {
> + if (ia64_platform_is("sn2"))
> + *node = nasid_to_cnodeid(mdei->node);
> + else
> + *node = mdei->node;
> + }
> + return;
Unnecessary return statement.
> + }
> + }
> + return;
And here.
> +}
> +
> +struct page *
> +alloc_migrate_page(struct page *ignored, unsigned long node, int **x)
static
> +{
> +
> + return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
> +}
> +
> +static int
> +ia64_mca_cpe_move_page(u64 paddr, u32 node)
> +{
> + LIST_HEAD(pagelist);
> + struct page *page;
> + int ret;
> + unsigned long irq_flags;
> +
> + local_irq_save(irq_flags);
> + /*
> + * Validate page
> + */
> + if (!paddr)
> + return -1;
-EINVAL ?
> + if (!ia64_phys_addr_valid(paddr))
> + return -1;
and here
> + if (!pfn_valid(paddr >> PAGE_SHIFT))
> + return -1;
ditto
> +
> + /*
> + * convert physical address to page number
> + */
> + page = phys_to_page(paddr);
> +
> + if (!spin_trylock(&cpe_migrate_lock)) {
> + local_irq_restore(irq_flags);
> + return -1;
-EBUSY?
> + }
> +
> + preempt_disable();
Why do we need to disable preempt here?
> + migrate_prep();
> + ret = isolate_lru_page(page, &pagelist);
> + preempt_enable();
> + if (ret) {
> + spin_unlock_irqrestore(&cpe_migrate_lock, irq_flags);
> + return ret;
goto would make the error handling cleaner here; see the sketch after the
quoted function.
> + }
> +
> + SetPageMemError(page); /* Mark the page as bad */
> + ret = migrate_pages(&pagelist, alloc_migrate_page, node);
> + if (ret == 0)
> + list_add_tail(&page->lru, &badpagelist);
> +
> + spin_unlock_irqrestore(&cpe_migrate_lock, irq_flags);
> + return 0;
> +}
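E.g. an untested sketch of the goto version, which would also restore
interrupts on the early validation exits (above they return with
interrupts still disabled):

	ret = -EINVAL;
	if (!paddr || !ia64_phys_addr_valid(paddr) ||
	    !pfn_valid(paddr >> PAGE_SHIFT))
		goto out_irq;

	page = phys_to_page(paddr);

	ret = -EBUSY;
	if (!spin_trylock(&cpe_migrate_lock))
		goto out_irq;

	preempt_disable();
	migrate_prep();
	ret = isolate_lru_page(page, &pagelist);
	preempt_enable();
	if (ret)
		goto out_unlock;

	SetPageMemError(page);		/* Mark the page as bad */
	ret = migrate_pages(&pagelist, alloc_migrate_page, node);
	if (ret == 0)
		list_add_tail(&page->lru, &badpagelist);

out_unlock:
	spin_unlock(&cpe_migrate_lock);
out_irq:
	local_irq_restore(irq_flags);
	return ret;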
> +
> +/*
> + * ia64_mca_cpe_migrate
> + * The worker that does the actual migration. It pulls a
> + * physical address off the list and calls the migration code.
> + *
> + * Inputs
> + * none
> + * Outputs
> + * none
> + */
kerneldoc?
> +static void
> +ia64_mca_cpe_migrate(struct work_struct *unused)
> +{
> + int ret;
> + u64 paddr;
> + u16 node;
> +
> + do {
> + if (cpe_paddr[cpe_tail]) {
> + paddr = cpe_paddr[cpe_tail];
> + node = cpe_node[cpe_tail];
> +
> + ret = ia64_mca_cpe_move_page(paddr, node);
> + if (ret <= 0)
> + /*
> + * Even though the return status is negative,
> + * clear the entry. If the same address has
> + * another CPE it will be re-added to the list.
> + */
> + cpe_paddr[cpe_tail] = 0;
> +
> + }
> + if (++cpe_tail >= CE_HISTORY_LENGTH)
> + cpe_tail = 0;
> +
> + } while (cpe_tail != cpe_head);
> + work_scheduled = 0;
> +}
> +static DECLARE_WORK(cpe_enable_work, ia64_mca_cpe_migrate);
> +
> +/*
> + * ce_setup_migrate
> + * Get the physical address out of the CPE record, add it
> + * to the list of addresses to migrate (if not already on),
> + * and schedule the back end worker task. This is called
> + * in interrupt context so cannot directly call the migration
> + * code.
> + *
> + * Inputs
> + * rec The CPE record
> + * Outputs
> + * 1 on Success, -1 on failure
> + */
> +static int
> +ce_setup_migrate(void *rec)
> +{
> + u64 paddr;
> + u16 node;
> + /* int head, tail; */
> + int i;
> +
> + if (!rec)
> + return -1;
-EINVAL?
> +
> + get_physical_address(rec, &paddr, &node);
> + if (!paddr)
> + return -1;
and here
> +
> + if (!((cpe_head == cpe_tail) && (cpe_paddr[cpe_head] == 0)))
> + /*
> + * List not empty
> + */
> + for (i = 0; i < CE_HISTORY_LENGTH; i++)
> + if ((PAGE_ALIGN(cpe_paddr[i])) == PAGE_ALIGN(paddr))
> + return 1; /* already on the list */
> +
> + if (cpe_paddr[cpe_head] == 0) {
> + cpe_paddr[cpe_head] = paddr;
> + cpe_node[cpe_head] = node;
> +
> + if (++cpe_head >= CE_HISTORY_LENGTH)
> + cpe_head = 0;
> + }
> +
> + if (!work_scheduled) {
> + work_scheduled = 1;
> + schedule_work(&cpe_enable_work);
So you must not schedule cpe_enable_work if it's already in progress. Why?
> + }
> +
> + return 1;
> +}
> +
> +/*
> + * =============================================================================
> + */
> +
> +int
> +freeOneBadPage(unsigned long paddr)
naming and static
> +{
> + struct page *page, *page2, *target;
> +
> + /*
> + * Verify page address
> + */
> + target = phys_to_page(paddr);
> + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> + if (page != target)
> + continue;
> +
> + ClearPageMemError(page); /* Mark the page as good */
> + move_to_lru(page);
> + list_del(&page->lru);
> + totalbad_pages--;
> + break;
> + }
> + return 0;
> +}
> +
> +int
> +freeAllBadPages(void)
here as well
> +{
> + struct page *page, *page2;
> +
> + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> + ClearPageMemError(page); /* Mark the page as good */
> + totalbad_pages--;
> + }
> + putback_lru_pages(&badpagelist);
> + return 0;
> +}
> +
> +#define OPT_LEN 16
> +
> +static ssize_t
> +badpage_write(struct file *file, const char __user *user,
> + size_t count, loff_t *data)
> +{
> + char optstr[OPT_LEN];
> + unsigned long opt;
> +	int len = OPT_LEN - 1;	/* leave room for the '\0' */
> +
> + if (count < len)
> + len = count;
> +
> + if (copy_from_user(optstr, user, len))
> + return -EFAULT;
> + optstr[len] = '\0';
> +
> + opt = simple_strtoul(optstr, NULL, 0);
> + if (opt == 0)
> + freeAllBadPages();
> + else
> + freeOneBadPage(opt);
> +
> + return count;
> +}
> +
> +int
> +badpage_show(struct seq_file *file, void *data)
> +{
> + struct page *page, *page2;
> + int i = 0;
> +
> + seq_printf(file, "Bad RAM: %d kB, %d pages marked bad\n"
> + "List of bad physical pages\n",
> + totalbad_pages << (PAGE_SHIFT - 10), totalbad_pages);
> + list_for_each_entry_safe(page, page2, &badpagelist, lru) {
> + seq_printf(file, " 0x%011lx", page_to_phys(page));
> + if (!(++i % 5))
> + seq_printf(file, "\n");
> + }
> + seq_printf(file, "\n");
> + return 0;
> +}
> +
> +int
> +badpage_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, badpage_show, NULL);
> +}
> +
> +static struct file_operations proc_badpage_operations = {
> + .open = badpage_open,
> +	.write = badpage_write,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = seq_release,
> +};
> +
> +static struct proc_dir_entry *proc_badpage;
> +
> +int __init cpe_migrate_external_handler_init(void)
> +{
> + spin_lock_init(&cpe_migrate_lock);
> +
> + /* register external ce handler */
> + if (ia64_reg_CE_extension(ce_setup_migrate)) {
> + printk(KERN_ERR "ia64_reg_CE_extension failed.\n");
> + return -EFAULT;
> + }
> + cpe_poll_enabled = cpe_polling_enabled;
> +
> + proc_badpage = create_proc_entry(BADRAM_BASENAME, 0644, NULL);
Why is this file not in sysfs?
> + if (proc_badpage == NULL) {
> + printk(KERN_ERR "unable to create %s proc entry",
> + BADRAM_BASENAME);
> + return -EINVAL;
> + }
> + proc_badpage->proc_fops = &proc_badpage_operations;
> + printk(KERN_INFO "Registered badram Driver\n");
> + return 0;
> +}
> +
> +void __exit cpe_migrate_external_handler_exit(void)
> +{
> + /* unregister external mca handlers */
> + ia64_unreg_CE_extension();
> + remove_proc_entry(BADRAM_BASENAME, NULL);
> +}
> +
> +module_init(cpe_migrate_external_handler_init);
> +module_exit(cpe_migrate_external_handler_exit);
> +
> +module_param(cpe_polling_enabled, int, 0644);
> +MODULE_PARM_DESC(cpe_polling_enabled,
> + "Enable polling with migration");
> +
> +MODULE_AUTHOR("Russ Anderson <rja@sgi.com>");
> +MODULE_DESCRIPTION("ia64 Corrected Error page migration driver");
> +MODULE_LICENSE("GPL");
> Index: linus/arch/ia64/kernel/mca.c
> ===================================================================
> --- linus.orig/arch/ia64/kernel/mca.c 2008-05-01 19:36:40.000000000 -0500
> +++ linus/arch/ia64/kernel/mca.c 2008-05-01 19:36:49.000000000 -0500
> @@ -68,6 +68,9 @@
> *
> * 2007-04-27 Russ Anderson <rja@sgi.com>
> * Support multiple cpus going through OS_MCA in the same event.
> + *
> + * 2008-04-22 Russ Anderson <rja@sgi.com>
> + * Migrate data off pages with correctable memory errors.
> */
> #include <linux/jiffies.h>
> #include <linux/types.h>
> @@ -163,7 +166,11 @@ static int cmc_polling_enabled = 1;
> * but encounters problems retrieving CPE logs. This should only be
> * necessary for debugging.
> */
> -static int cpe_poll_enabled = 1;
> +int cpe_poll_enabled = 1;
> +EXPORT_SYMBOL(cpe_poll_enabled);
> +
> +LIST_HEAD(badpagelist);
> +EXPORT_SYMBOL(badpagelist);
>
> extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
>
> @@ -525,6 +532,28 @@ EXPORT_SYMBOL_GPL(mca_recover_range);
>
> #ifdef CONFIG_ACPI
>
> +/* Function pointer to Corrected Error memory migration driver */
> +int (*ia64_mca_ce_extension)(void *) = NULL;
> +
> +int
> +ia64_reg_CE_extension(int (*fn)(void *))
> +{
> + if (ia64_mca_ce_extension)
> + return 1;
> +
> + ia64_mca_ce_extension = fn;
> + return 0;
> +}
> +EXPORT_SYMBOL(ia64_reg_CE_extension);
> +
> +void
> +ia64_unreg_CE_extension(void)
> +{
> + if (ia64_mca_ce_extension)
> + ia64_mca_ce_extension = NULL;
> +}
> +EXPORT_SYMBOL(ia64_unreg_CE_extension);
> +
> int cpe_vector = -1;
> int ia64_cpe_irq = -1;
>
> @@ -534,6 +563,7 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
> static unsigned long cpe_history[CPE_HISTORY_LENGTH];
> static int index;
> static DEFINE_SPINLOCK(cpe_history_lock);
> + int recover;
>
> IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n",
> __func__, cpe_irq, smp_processor_id());
> @@ -580,6 +610,8 @@ ia64_mca_cpe_int_handler (int cpe_irq, v
> out:
> /* Get the CPE error record and log it */
> ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
> + recover = (ia64_mca_ce_extension && ia64_mca_ce_extension(
> + IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_CPE)));
>
> return IRQ_HANDLED;
> }
> Index: linus/arch/ia64/Kconfig
> ===================================================================
> --- linus.orig/arch/ia64/Kconfig 2008-05-01 19:36:40.000000000 -0500
> +++ linus/arch/ia64/Kconfig 2008-05-01 19:36:49.000000000 -0500
> @@ -456,6 +456,9 @@ config COMPAT_FOR_U64_ALIGNMENT
> config IA64_MCA_RECOVERY
> tristate "MCA recovery from errors other than TLB."
>
> +config IA64_CPE_MIGRATE
> + tristate "Migrate data off pages with correctable errors"
> +
> config PERFMON
> bool "Performance monitor support"
> help
> Index: linus/include/asm-ia64/mca.h
> ===================================================================
> --- linus.orig/include/asm-ia64/mca.h 2008-05-01 19:36:40.000000000 -0500
> +++ linus/include/asm-ia64/mca.h 2008-05-01 19:36:49.000000000 -0500
> @@ -137,6 +137,7 @@ extern unsigned long __per_cpu_mca[NR_CP
>
> extern int cpe_vector;
> extern int ia64_cpe_irq;
> +extern int cpe_poll_enabled;
> extern void ia64_mca_init(void);
> extern void ia64_mca_cpu_init(void *);
> extern void ia64_os_mca_dispatch(void);
> @@ -150,9 +151,13 @@ extern void ia64_slave_init_handler(void
> extern void ia64_mca_cmc_vector_setup(void);
> extern int ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *));
> extern void ia64_unreg_MCA_extension(void);
> +extern int ia64_reg_CE_extension(int (*fn)(void *));
> +extern void ia64_unreg_CE_extension(void);
> extern u64 ia64_get_rnat(u64 *);
> extern void ia64_mca_printk(const char * fmt, ...)
> __attribute__ ((format (printf, 1, 2)));
> +
> +extern struct list_head badpagelist;
>
> struct ia64_mca_notify_die {
> struct ia64_sal_os_state *sos;
> Index: linus/mm/migrate.c
> ===================================================================
> --- linus.orig/mm/migrate.c 2008-05-01 19:36:40.000000000 -0500
> +++ linus/mm/migrate.c 2008-05-01 19:36:49.000000000 -0500
> @@ -64,6 +64,7 @@ int isolate_lru_page(struct page *page,
> }
> return ret;
> }
> +EXPORT_SYMBOL(isolate_lru_page);
>
> /*
> * migrate_prep() needs to be called before we start compiling a list of pages
> @@ -81,8 +82,9 @@ int migrate_prep(void)
>
> return 0;
> }
> +EXPORT_SYMBOL(migrate_prep);
>
> -static inline void move_to_lru(struct page *page)
> +inline void move_to_lru(struct page *page)
> {
> if (PageActive(page)) {
> /*
> @@ -96,6 +98,7 @@ static inline void move_to_lru(struct pa
> }
> put_page(page);
> }
> +EXPORT_SYMBOL(move_to_lru);
>
> /*
> * Add isolated pages on the list back to the LRU.
> @@ -115,6 +118,7 @@ int putback_lru_pages(struct list_head *
> }
> return count;
> }
> +EXPORT_SYMBOL(putback_lru_pages);
>
> /*
> * Restore a potential migration pte to a working pte entry
> @@ -805,6 +809,7 @@ out:
>
> return nr_failed + retry;
> }
> +EXPORT_SYMBOL(migrate_pages);
>
> #ifdef CONFIG_NUMA
> /*
> Index: linus/include/asm-ia64/page.h
> ===================================================================
> --- linus.orig/include/asm-ia64/page.h 2008-05-01 19:36:40.000000000 -0500
> +++ linus/include/asm-ia64/page.h 2008-05-01 19:36:49.000000000 -0500
> @@ -122,6 +122,7 @@ extern unsigned long max_low_pfn;
> #endif
>
> #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
> +#define phys_to_page(kaddr) (pfn_to_page(kaddr >> PAGE_SHIFT))
> #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
> #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
>
> Index: linus/mm/page_alloc.c
> ===================================================================
> --- linus.orig/mm/page_alloc.c 2008-05-01 19:36:40.000000000 -0500
> +++ linus/mm/page_alloc.c 2008-05-01 19:36:49.000000000 -0500
> @@ -72,6 +72,7 @@ unsigned long totalreserve_pages __read_
> long nr_swap_pages;
> int percpu_pagelist_fraction;
> unsigned int totalbad_pages;
> +EXPORT_SYMBOL(totalbad_pages);
>
> #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
> int pageblock_order __read_mostly;
> Index: linus/include/linux/migrate.h
> ===================================================================
> --- linus.orig/include/linux/migrate.h 2008-05-01 19:36:40.000000000 -0500
> +++ linus/include/linux/migrate.h 2008-05-01 19:36:49.000000000 -0500
> @@ -38,6 +38,7 @@ extern int migrate_prep(void);
> extern int migrate_vmas(struct mm_struct *mm,
> const nodemask_t *from, const nodemask_t *to,
> unsigned long flags);
> +extern void move_to_lru(struct page *);
> #else
> static inline int vma_migratable(struct vm_area_struct *vma)
> { return 0; }
> @@ -59,6 +60,7 @@ static inline int migrate_vmas(struct mm
> {
> return -ENOSYS;
> }
> +static inline void move_to_lru(struct page *p) { return; }
unnecessary return statement
>
> /* Possible settings for the migrate_page() method in address_operations */
> #define migrate_page NULL
> --
> Russ Anderson, OS RAS/Partitioning Project Lead
> SGI - Silicon Graphics Inc rja@sgi.com
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Russ Anderson @ 2008-05-02 16:40 UTC (permalink / raw)
To: Pekka Enberg
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter
On Fri, May 02, 2008 at 12:58:12PM +0300, Pekka Enberg wrote:
> Hi Russ,
>
> On Fri, May 2, 2008 at 3:44 AM, Russ Anderson <rja@sgi.com> wrote:
> > Migrate data off pages with correctable memory errors. This patch is the
> > ia64 specific piece. It connects the CPE handler to the page migration
> > code. It is implemented as a kernel loadable module, similar to the mca
> > recovery code (mca_recovery.ko). This allows the feature to be turned off
> > by uninstalling the module. Creates /proc/badram to display bad page
> > information and free bad pages.
> >
> > Signed-off-by: Russ Anderson <rja@sgi.com>
>
> I think sparse and checkpatch would have caught most of these but here goes:
I did run checkpatch.pl. Of the two warnings, one was a false positive and
the other I let slide. I'll make the rest of your suggested changes.
---
linux> scripts/checkpatch.pl ../patches/cpe.migrate.v2
WARNING: Use #include <linux/mca.h> instead of <asm/mca.h>
#80: FILE: arch/ia64/kernel/cpe_migrate.c:41:
+#include <asm/mca.h>
WARNING: consider using strict_strtoul in preference to simple_strtoul
#340: FILE: arch/ia64/kernel/cpe_migrate.c:301:
+ opt = simple_strtoul(optstr, NULL, 0);
total: 0 errors, 2 warnings, 548 lines checked
---
> > + if (cpe_paddr[cpe_head] == 0) {
> > + cpe_paddr[cpe_head] = paddr;
> > + cpe_node[cpe_head] = node;
> > +
> > + if (++cpe_head >= CE_HISTORY_LENGTH)
> > + cpe_head = 0;
> > + }
> > +
> > + if (!work_scheduled) {
> > + work_scheduled = 1;
> > + schedule_work(&cpe_enable_work);
>
> So you must not schedule cpe_enable_work if it's already in progress. Why?
If there is already a worker thread scheduled, it will process all
the addresses on the queue, including new entries. So all ce_setup_migrate()
needs to do is add the new entry to the queue. The CPE interrupt can come in faster
than the worker thread can migrate the pages. Scheduling another worker
thread on each CPE interrupt when there is already one scheduled/running
would be overkill.
> > + proc_badpage = create_proc_entry(BADRAM_BASENAME, 0644, NULL);
>
> Why is this file not in sysfs?
/proc seemed like an appropriate place. Where would you suggest in sysfs?
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Pekka J Enberg @ 2008-05-02 16:57 UTC (permalink / raw)
To: Russ Anderson
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter, gregkh
Hi Russ,
On Fri, 2 May 2008, Russ Anderson wrote:
> > I think sparse and checkpatch would have caught most of these but here goes:
>
> I did run checkpatch.pl. Of the two warnings, one was a false positive and
> the other I let slide. I'll make the rest of your suggested changes.
>
> WARNING: consider using strict_strtoul in preference to simple_strtoul
> #340: FILE: arch/ia64/kernel/cpe_migrate.c:301:
> + opt = simple_strtoul(optstr, NULL, 0);
Why do you think this is a false positive? We just converted SLUB over to
use strict_strtoul() as suggested by Andrew.
> > > + if (cpe_paddr[cpe_head] == 0) {
> > > + cpe_paddr[cpe_head] = paddr;
> > > + cpe_node[cpe_head] = node;
> > > +
> > > + if (++cpe_head >= CE_HISTORY_LENGTH)
> > > + cpe_head = 0;
> > > + }
> > > +
> > > + if (!work_scheduled) {
> > > + work_scheduled = 1;
> > > + schedule_work(&cpe_enable_work);
> >
> > So you must not schedule cpe_enable_work if it's already in progress. Why?
>
> If there is already a worker thread scheduled, it will process all
> the addresses on the queue, including new entries. So all ce_setup_migrate()
> needs to do is add the new entry to the queue. The CPE interrupt can come in faster
> than the worker thread can migrate the pages. Scheduling another worker
> thread on each CPE interrupt when there is already one scheduled/running
> would be overkill.
Okay. Maybe a kthread that sleeps when the buffer is empty would be
cleaner here, then? I didn't notice any locking for cpe_paddr and
cpe_node. Why is that?
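Something like this, perhaps (untested; the names are made up, and it
needs <linux/kthread.h> and <linux/wait.h>):

	static DECLARE_WAIT_QUEUE_HEAD(cpe_migrate_wq);

	static int cpe_migrate_thread(void *unused)
	{
		while (!kthread_should_stop()) {
			wait_event_interruptible(cpe_migrate_wq,
					cpe_paddr[cpe_tail] != 0 ||
					kthread_should_stop());
			/* drain the ring, as ia64_mca_cpe_migrate() does now */
		}
		return 0;
	}

The interrupt side would then just fill the ring and wake_up(&cpe_migrate_wq).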
> > > + proc_badpage = create_proc_entry(BADRAM_BASENAME, 0644, NULL);
> >
> > Why is this file not in sysfs?
>
> /proc seemed like a appropriate place. Where would you suggest in sysfs?
Oh, /proc is for _process-specific_ files, although historically it has
been (ab)used for other files as well. I think something like
/sys/kernel/badram would be appropriate. Christoph, Greg?
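Something along these lines, perhaps (untested sketch using
kobj_attribute; the names are made up):

	static ssize_t badram_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
	{
		return sprintf(buf, "%u\n", totalbad_pages);
	}

	static ssize_t badram_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
	{
		/* parse the address and free pages, as the proc write does */
		return count;
	}

	static struct kobj_attribute badram_attr =
		__ATTR(badram, 0644, badram_show, badram_store);

	/* in the init function: */
	ret = sysfs_create_file(kernel_kobj, &badram_attr.attr);

(The full page list probably wants splitting up to fit sysfs's
one-value-per-file convention, though.)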
Pekka
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Russ Anderson @ 2008-05-02 17:30 UTC (permalink / raw)
To: Pekka J Enberg
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter, gregkh
On Fri, May 02, 2008 at 07:57:44PM +0300, Pekka J Enberg wrote:
> Hi Russ,
>
> On Fri, 2 May 2008, Russ Anderson wrote:
> > > I think sparse and checkpatch would have caught most of these but here goes:
> >
> > I did run checkpatch.pl. Of the two warnings, one was a false positive and
> > the other I let slide. I'll make the rest of your suggested changes.
> >
> > WARNING: consider using strict_strtoul in preference to simple_strtoul
> > #340: FILE: arch/ia64/kernel/cpe_migrate.c:301:
> > + opt = simple_strtoul(optstr, NULL, 0);
>
> Why do you think this is a false positive?
The false positive is:
---
WARNING: Use #include <linux/mca.h> instead of <asm/mca.h>
#77: FILE: arch/ia64/kernel/cpe_migrate.c:40:
+#include <asm/mca.h>
---
linux/mca.h is a different file than asm-ia64/mca.h, so using
<linux/mca.h> instead of <asm/mca.h> will cause build problems.
> We just converted SLUB over to
> use strict_strtoul().
I'll use strict_strtoul().
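I.e. (untested):

	unsigned long opt;

	if (strict_strtoul(optstr, 0, &opt))
		return -EINVAL;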
> > > > + if (cpe_paddr[cpe_head] == 0) {
> > > > + cpe_paddr[cpe_head] = paddr;
> > > > + cpe_node[cpe_head] = node;
> > > > +
> > > > + if (++cpe_head >= CE_HISTORY_LENGTH)
> > > > + cpe_head = 0;
> > > > + }
> > > > +
> > > > + if (!work_scheduled) {
> > > > + work_scheduled = 1;
> > > > + schedule_work(&cpe_enable_work);
> > >
> > > So you must not schedule cpe_enable_work if it's already in progress. Why?
> >
> > If there is already a worker thread scheduled, it will process all
> > the addresses on the queue, including new entries. So all ce_setup_migrate()
> > needs to do is add the new entry to the queue. The CPE interrupt can come in faster
> > than the worker thread can migrate the pages. Scheduling another worker
> > thread on each CPE interrupt when there is already one scheduled/running
> > would be overkill.
>
> Okay. Maybe a kthread that sleeps when the buffer is empty would be
> cleaner here, then? I didn't notice any locking for cpe_paddr and
> cpe_node. Why is that?
Do you mean a lock so that cpe_paddr and cpe_node are updated atomically?
The hole would be the new page being allocated on a different node than
the old page.
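If that is the concern, the paired update could be done under a lock;
untested sketch (cpe_list_lock is a made-up dedicated lock, and it would
have to be safe to take in interrupt context):

	spin_lock(&cpe_list_lock);
	if (cpe_paddr[cpe_head] == 0) {
		cpe_paddr[cpe_head] = paddr;
		cpe_node[cpe_head] = node;
		if (++cpe_head >= CE_HISTORY_LENGTH)
			cpe_head = 0;
	}
	spin_unlock(&cpe_list_lock);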
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Andi Kleen @ 2008-05-02 17:45 UTC (permalink / raw)
To: Russ Anderson
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter
Russ Anderson <rja@sgi.com> writes:
> Migrate data off pages with correctable memory errors. This patch is the
> ia64 specific piece. It connects the CPE handler to the page migration
> code. It is implemented as a kernel loadable module, similar to the mca
> recovery code (mca_recovery.ko). This allows the feature to be turned off
> by uninstalling the module. Creates /proc/badram to display bad page
> information and free bad pages.
How do you know what pages have excessive errors? And how is excessive defined?
Surely you don't keep a per page error count? It's unclear from your patch.
Anyways I don't think this should be ia64 specific, but generic code.
I also have my doubts about making such small code subsystems modules. Modules
always get rounded to pages so it ultimately just wastes memory.
-Andi
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Russ Anderson @ 2008-05-02 18:50 UTC (permalink / raw)
To: Andi Kleen
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter
On Fri, May 02, 2008 at 07:45:55PM +0200, Andi Kleen wrote:
> Russ Anderson <rja@sgi.com> writes:
>
> > Migrate data off pages with correctable memory errors. This patch is the
> > ia64 specific piece. It connects the CPE handler to the page migration
> > code. It is implemented as a kernel loadable module, similar to the mca
> > recovery code (mca_recovery.ko). This allows the feature to be turned off
> > by uninstalling the module. Creates /proc/badram to display bad page
> > information and free bad pages.
>
> How do you know what pages have excessive errors? And how is excessive defined?
> Surely you don't keep a per page error count? It's unclear from your patch.
The code migrates on the first correctable error on a page, so "excessive"
is one. Yes, keeping a per page count gets problematic, especially as
memories get larger. The issue of the right metric for when to migrate
will always be debatable and the "right" answer likely will depend on
the physical memory technology.
> Anyways I don't think this should be ia64 specific, but generic code.
The actual migration code is generic, in mm/migrate.c.
The ia64 kernel module piece is very arch specific. It ties into the
ia64 CPE handler. It gets the page address from the CPE record, based
on the ia64 error handling spec. Each arch will have a different way of
determining the physical address of the correctable error (for example).
> I also have my doubts about making such small code subsystems modules. Modules
> always get rounded to pages so it ultimately just wastes memory.
CONFIG_IA64_CPE_MIGRATE=m builds it as module.
CONFIG_IA64_CPE_MIGRATE=y builds it as part of the kernel.
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Andi Kleen @ 2008-05-02 19:33 UTC (permalink / raw)
To: Russ Anderson
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter
Russ Anderson wrote:
> On Fri, May 02, 2008 at 07:45:55PM +0200, Andi Kleen wrote:
>> Russ Anderson <rja@sgi.com> writes:
>>
>>> Migrate data off pages with correctable memory errors. This patch is the
>>> ia64 specific piece. It connects the CPE handler to the page migration
>>> code. It is implemented as a kernel loadable module, similar to the mca
>>> recovery code (mca_recovery.ko). This allows the feature to be turned off
>>> by uninstalling the module. Creates /proc/badram to display bad page
>>> information and free bad pages.
>> How do you know what pages have excessive errors? And how is excessive defined?
>> Surely you don't keep a per page error count? It's unclear from your patch.
>
> The code migrates on the first correctable error on a page, so "excessive"
> is one. Yes, keeping a per page count gets problematic, especially as
> memories get larger. The issue of the right metric for when to migrate
> will always be debatable and the "right" answer likely will depend on
> the physical memory technology.
That doesn't sound like a good metric. What happens for example if you
have a flakey memory controller or DIMM socket that just occasionally
adds bit errors on transfer without the DIMM actually being bad? I doubt
such a simple heuristic is a good one.
The x86-64 machine check design has a database connected to a user space
program that can keep track of errors. I suspect something like this is
a better solution.
>
>> Anyways I don't think this should be ia64 specific, but generic code.
>
> The actual migration code is generic, in mm/migrate.c.
>
> The ia64 kernel module piece is very arch specific. It ties into the
> ia64 CPE handler. It gets the page address from the CPE record, based
> on the ia64 error handling spec. Each arch will have a different way of
> determining the physical address of the correctable error (for example).
Still at least part of the code should be generic.
Also in general it's good that you finally realized that trying to write
lockless machine check handlers was a bad idea and that to do any
meaningful handling requires a proper process context. But I still wonder
when you move the rest of the ia64 machine check handling to that too so
that e.g. the sched.c hacks for lockless handling can be removed?
>> I also have my doubts about making such small code subsystems modules. Modules
> >> always get rounded to pages so it ultimately just wastes memory.
>
> CONFIG_IA64_CPE_MIGRATE=m builds it as module.
It's far too small for it.
-Andi
* Re: [PATCH 3/3] ia64: Call migration code on correctable errors v2
From: Russ Anderson @ 2008-05-02 20:27 UTC (permalink / raw)
To: Andi Kleen
Cc: linux-kernel, linux-ia64, Linus Torvalds, Andrew Morton,
Tony Luck, Christoph Lameter
On Fri, May 02, 2008 at 09:33:54PM +0200, Andi Kleen wrote:
> Russ Anderson wrote:
> >
> > The code migrates on the first correctable error on a page, so "excessive"
> > is one. Yes, keeping a per page count gets problematic, especially as
> > memories get larger. The issue of the right metric for when to migrate
> > will always be debatable and the "right" answer likely will depend on
> > the physical memory technology.
>
> That doesn't sound like a good metric. What happens for example if you
> have a flakey memory controller or DIMM socket that just occasionally
> adds bit errors on transfer without the DIMM actually being bad? I doubt
> such a simple heuristic is a good one.
Then occasionally pages on that DIMM get marked bad and are no longer used.
There have been cases where there were only one or two correctable errors
before the error degraded to an uncorrectable one. The "right" answer
likely will depend on the physical memory technology.
Keep in mind that the discarded pages can later be freed.
> The x86-64 machine check design has a database connected to a user space
> program that can keep track of errors. I suspect something like this is
> a better solution.
I'll have to find out more.
> >> Anyways I don't think this should be ia64 specific, but generic code.
> >
> > The actual migration code is generic, in mm/migrate.c.
> >
> > The ia64 kernel module piece is very arch specific. It ties into the
> > ia64 CPE handler. It gets the page address from the CPE record, based
> > on the ia64 error handling spec. Each arch will have a different way of
> > determining the physical address of the correctable error (for example).
>
> Still at least part of the code should be generic.
Part is generic. What other parts do you want generic?
I'm not opposed to the idea, but things like parsing the
CPE record is platform specific.
> Also in general it's good that you finally realized that trying to write
> lockless machine check handlers was a bad idea and that to do any
> meaningful handling requires a proper process context. But I still wonder
> when you move the rest of the ia64 machine check handling to that too so
> that e.g. the sched.c hacks for lockless handling can be removed?
Are you referring to ia64 MCAs that can surface at any time? That is
an Intel CPU design issue far outside anything I can control. CPEs
are regular interrupts, which is why they can be handled in a sane
manner.
> >> I also have my doubts about making such small code subsystems modules. Modules
> >> always get rounded to pages so it ultimatively just wastes memory.
> >
> > CONFIG_IA64_CPE_MIGRATE=m builds it as module.
>
> It's far too small for it.
Maybe I do not understand the issue. Is the issue wasting part of one
page? Does building CONFIG_IA64_CPE_MIGRATE=y waste memory?
--
Russ Anderson, OS RAS/Partitioning Project Lead
SGI - Silicon Graphics Inc rja@sgi.com