From: Wu Fengguang <fengguang.wu@intel.com>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: LKML <linux-kernel@vger.kernel.org>,
Nick Piggin <npiggin@suse.de>,
Hugh Dickins <hugh.dickins@tiscali.co.uk>,
Wu Fengguang <fengguang.wu@intel.com>,
Andi Kleen <andi@firstfloor.org>,
"riel@redhat.com" <riel@redhat.com>,
"chris.mason@oracle.com" <chris.mason@oracle.com>,
"linux-mm@kvack.org" <linux-mm@kvack.org>
Subject: [PATCH 3/5] HWPOISON: remove early kill option for now
Date: Thu, 11 Jun 2009 22:22:42 +0800 [thread overview]
Message-ID: <20090611144430.682162784@intel.com> (raw)
In-Reply-To: 20090611142239.192891591@intel.com
[-- Attachment #1: hwpoison-remove-early-kill.patch --]
[-- Type: text/plain, Size: 14700 bytes --]
It needs more thought, and is not a must-have for .31.
CC: Nick Piggin <npiggin@suse.de>
CC: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
Documentation/sysctl/vm.txt | 28 ---
include/linux/mm.h | 1
include/linux/rmap.h | 6
kernel/sysctl.c | 13 -
mm/filemap.c | 4
mm/memory-failure.c | 272 ----------------------------------
mm/rmap.c | 8 -
7 files changed, 3 insertions(+), 329 deletions(-)
--- sound-2.6.orig/mm/memory-failure.c
+++ sound-2.6/mm/memory-failure.c
@@ -48,251 +48,9 @@
#include <linux/backing-dev.h>
#include "internal.h"
-int sysctl_memory_failure_early_kill __read_mostly = 1;
-
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
/*
- * Send all the processes who have the page mapped an ``action optional''
- * signal.
- */
-static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
-{
- struct siginfo si;
- int ret;
-
- printk(KERN_ERR
- "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
- si.si_signo = SIGBUS;
- si.si_errno = 0;
- si.si_code = BUS_MCEERR_AO;
- si.si_addr = (void *)addr;
-#ifdef __ARCH_SI_TRAPNO
- si.si_trapno = trapno;
-#endif
- si.si_addr_lsb = PAGE_SHIFT;
- /*
- * Don't use force here, it's convenient if the signal
- * can be temporarily blocked.
- * This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully noone will do that?
- */
- ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
- if (ret < 0)
- printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
- t->comm, t->pid, ret);
- return ret;
-}
-
-/*
- * Kill all processes that have a poisoned page mapped and then isolate
- * the page.
- *
- * General strategy:
- * Find all processes having the page mapped and kill them.
- * But we keep a page reference around so that the page is not
- * actually freed yet.
- * Then stash the page away
- *
- * There's no convenient way to get back to mapped processes
- * from the VMAs. So do a brute-force search over all
- * running processes.
- *
- * Remember that machine checks are not common (or rather
- * if they are common you have other problems), so this shouldn't
- * be a performance issue.
- *
- * Also there are some races possible while we get from the
- * error detection to actually handle it.
- */
-
-struct to_kill {
- struct list_head nd;
- struct task_struct *tsk;
- unsigned long addr;
- unsigned addr_valid:1;
-};
-
-/*
- * Failure handling: if we can't find or can't kill a process there's
- * not much we can do. We just print a message and ignore otherwise.
- */
-
-/*
- * Schedule a process for later kill.
- * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
- * TBD would GFP_NOIO be enough?
- */
-static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill,
- struct to_kill **tkc)
-{
- struct to_kill *tk;
-
- if (*tkc) {
- tk = *tkc;
- *tkc = NULL;
- } else {
- tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
- if (!tk) {
- printk(KERN_ERR
- "MCE: Out of memory while machine check handling\n");
- return;
- }
- }
- tk->addr = page_address_in_vma(p, vma);
- tk->addr_valid = 1;
-
- /*
- * In theory we don't have to kill when the page was
- * munmaped. But it could be also a mremap. Since that's
- * likely very rare kill anyways just out of paranoia, but use
- * a SIGKILL because the error is not contained anymore.
- */
- if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
- page_to_pfn(p), tsk->comm);
- tk->addr_valid = 0;
- }
- get_task_struct(tsk);
- tk->tsk = tsk;
- list_add_tail(&tk->nd, to_kill);
-}
-
-/*
- * Kill the processes that have been collected earlier.
- *
- * Only do anything when DOIT is set, otherwise just free the list
- * (this is used for clean pages which do not need killing)
- * Also when FAIL is set do a force kill because something went
- * wrong earlier.
- */
-static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
-{
- struct to_kill *tk, *next;
-
- list_for_each_entry_safe (tk, next, to_kill, nd) {
- if (doit) {
- /*
- * In case something went wrong with munmaping
- * make sure the process doesn't catch the
- * signal and then access the memory. Just kill it.
- * the signal handlers
- */
- if (fail || tk->addr_valid == 0) {
- printk(KERN_ERR
- "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
- force_sig(SIGKILL, tk->tsk);
- }
-
- /*
- * In theory the process could have mapped
- * something else on the address in-between. We could
- * check for that, but we need to tell the
- * process anyways.
- */
- else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
- printk(KERN_ERR
- "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
- }
- put_task_struct(tk->tsk);
- kfree(tk);
- }
-}
-
-/*
- * Collect processes when the error hit an anonymous page.
- */
-static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
-{
- struct vm_area_struct *vma;
- struct task_struct *tsk;
- struct anon_vma *av;
-
- read_lock(&tasklist_lock);
-
- av = page_lock_anon_vma(page);
- if (av == NULL) /* Not actually mapped anymore */
- goto out;
-
- for_each_process (tsk) {
- if (!tsk->mm)
- continue;
- list_for_each_entry (vma, &av->head, anon_vma_node) {
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
- }
- }
- page_unlock_anon_vma(av);
-out:
- read_unlock(&tasklist_lock);
-}
-
-/*
- * Collect processes when the error hit a file mapped page.
- */
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
-{
- struct vm_area_struct *vma;
- struct task_struct *tsk;
- struct prio_tree_iter iter;
- struct address_space *mapping = page_mapping(page);
-
- /*
- * A note on the locking order between the two locks.
- * We don't rely on this particular order.
- * If you have some other code that needs a different order
- * feel free to switch them around. Or add a reverse link
- * from mm_struct to task_struct, then this could be all
- * done without taking tasklist_lock and looping over all tasks.
- */
-
- read_lock(&tasklist_lock);
- spin_lock(&mapping->i_mmap_lock);
- for_each_process(tsk) {
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
- if (!tsk->mm)
- continue;
-
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
- pgoff)
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
- }
- spin_unlock(&mapping->i_mmap_lock);
- read_unlock(&tasklist_lock);
-}
-
-/*
- * Collect the processes who have the corrupted page mapped to kill.
- * This is done in two steps for locking reasons.
- * First preallocate one tokill structure outside the spin locks,
- * so that we can kill at least one process reasonably reliable.
- */
-static void collect_procs(struct page *page, struct list_head *tokill)
-{
- struct to_kill *tk;
-
- tk = kmalloc(sizeof(struct to_kill), GFP_KERNEL);
- /* memory allocation failure is implicitly handled */
- if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
- else
- collect_procs_file(page, tokill, &tk);
- kfree(tk);
-}
-
-/*
* Error handlers for various types of pages.
*/
@@ -599,7 +357,6 @@ static void hwpoison_user_mappings(struc
int trapno)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
- int kill = sysctl_memory_failure_early_kill;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
@@ -633,7 +390,6 @@ static void hwpoison_user_mappings(struc
if (page_mkclean(p))
SetPageDirty(p);
else {
- kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
printk(KERN_INFO
"MCE %#lx: corrupted page was clean: dropped without side effects\n",
@@ -642,22 +398,6 @@ static void hwpoison_user_mappings(struc
}
/*
- * First collect all the processes that have the page
- * mapped. This has to be done before try_to_unmap,
- * because ttu takes the rmap data structures down.
- *
- * This also has the side effect to propagate the dirty
- * bit from PTEs into the struct page. This is needed
- * to actually decide if something needs to be killed
- * or errored, or if it's ok to just drop the page.
- *
- * Error handling: We ignore errors here because
- * there's nothing that can be done.
- */
- if (kill)
- collect_procs(p, &tokill);
-
- /*
* try_to_unmap can fail temporarily due to races.
* Try a few times (RED-PEN better strategy?)
*/
@@ -671,18 +411,6 @@ static void hwpoison_user_mappings(struc
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(p));
-
- /*
- * Now that the dirty bit has been propagated to the
- * struct page and all unmaps done we can decide if
- * killing is needed or not. Only kill when the page
- * was dirty, otherwise the tokill list is merely
- * freed. When there was a problem unmapping earlier
- * use a more force-full uncatchable kill to prevent
- * any accesses to the poisoned memory.
- */
- kill_procs_ao(&tokill, !!PageDirty(p), trapno,
- ret != SWAP_SUCCESS, pfn);
}
/**
--- sound-2.6.orig/Documentation/sysctl/vm.txt
+++ sound-2.6/Documentation/sysctl/vm.txt
@@ -32,7 +32,6 @@ Currently, these files are in /proc/sys/
- legacy_va_layout
- lowmem_reserve_ratio
- max_map_count
-- memory_failure_early_kill
- min_free_kbytes
- min_slab_ratio
- min_unmapped_ratio
@@ -54,6 +53,7 @@ Currently, these files are in /proc/sys/
- vfs_cache_pressure
- zone_reclaim_mode
+
==============================================================
block_dump
@@ -275,32 +275,6 @@ e.g., up to one or two maps per allocati
The default value is 65536.
-=============================================================
-
-memory_failure_early_kill:
-
-Control how to kill processes when uncorrected memory error (typically
-a 2bit error in a memory module) is detected in the background by hardware
-that cannot be handled by the kernel. In some cases (like the page
-still having a valid copy on disk) the kernel will handle the failure
-transparently without affecting any applications. But if there is
-no other uptodate copy of the data it will kill to prevent any data
-corruptions from propagating.
-
-1: Kill all processes that have the corrupted and not reloadable page mapped
-as soon as the corruption is detected. Note this is not supported
-for a few types of pages, like kernel internally allocated data or
-the swap cache, but works for the majority of user pages.
-
-0: Only unmap the corrupted page from all processes and only kill a process
-who tries to access it.
-
-The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
-handle this if they want to.
-
-This is only active on architectures/platforms with advanced machine
-check handling and depends on the hardware capabilities.
-
==============================================================
min_free_kbytes:
--- sound-2.6.orig/include/linux/mm.h
+++ sound-2.6/include/linux/mm.h
@@ -1331,7 +1331,6 @@ extern int account_locked_memory(struct
extern void refund_locked_memory(struct mm_struct *mm, size_t size);
extern void memory_failure(unsigned long pfn, int trapno);
-extern int sysctl_memory_failure_early_kill;
extern atomic_long_t mce_bad_pages;
#endif /* __KERNEL__ */
--- sound-2.6.orig/kernel/sysctl.c
+++ sound-2.6/kernel/sysctl.c
@@ -1319,19 +1319,6 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &scan_unevictable_handler,
},
-#ifdef CONFIG_MEMORY_FAILURE
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "memory_failure_early_kill",
- .data = &sysctl_memory_failure_early_kill,
- .maxlen = sizeof(sysctl_memory_failure_early_kill),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &zero,
- .extra2 = &one,
- },
-#endif
/*
* NOTE: do not add new entries to this table unless you have read
--- sound-2.6.orig/mm/filemap.c
+++ sound-2.6/mm/filemap.c
@@ -105,10 +105,6 @@
*
* ->task->proc_lock
* ->dcache_lock (proc_pid_lookup)
- *
- * (code doesn't rely on that order, so you could switch it around)
- * ->tasklist_lock (memory_failure, collect_procs_ao)
- * ->i_mmap_lock
*/
/*
--- sound-2.6.orig/mm/rmap.c
+++ sound-2.6/mm/rmap.c
@@ -36,10 +36,6 @@
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
* within inode_lock in __sync_single_inode)
- *
- * (code doesn't rely on that order so it could be switched around)
- * ->tasklist_lock
- * anon_vma->lock (memory_failure, collect_procs_anon)
*/
#include <linux/mm.h>
@@ -195,7 +191,7 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+static struct anon_vma *page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma;
unsigned long anon_mapping;
@@ -215,7 +211,7 @@ out:
return NULL;
}
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+static void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
--- sound-2.6.orig/include/linux/rmap.h
+++ sound-2.6/include/linux/rmap.h
@@ -129,12 +129,6 @@ int try_to_munlock(struct page *);
int page_wrprotect(struct page *page, int *odirect_sync, int count_offset);
#endif
-/*
- * Called by memory-failure.c to kill processes.
- */
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
-
#else /* !CONFIG_MMU */
#define anon_vma_init() do {} while (0)
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2009-06-11 14:52 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-06-11 14:22 [PATCH 0/5] [RFC] HWPOISON incremental fixes Wu Fengguang
2009-06-11 14:22 ` [PATCH 1/5] HWPOISON: define VM_FAULT_HWPOISON to 0 when feature is disabled Wu Fengguang
2009-06-11 15:44 ` Rik van Riel
2009-06-12 10:00 ` Andi Kleen
2009-06-12 13:15 ` Wu Fengguang
2009-06-12 11:22 ` Ingo Molnar
2009-06-12 12:57 ` Wu Fengguang
2009-06-12 13:17 ` Ingo Molnar
2009-06-12 13:33 ` Wu Fengguang
2009-06-12 15:36 ` Ingo Molnar
2009-06-12 16:14 ` Wu Fengguang
2009-06-12 18:07 ` Alan Cox
2009-06-12 17:55 ` Theodore Tso
2009-06-12 13:58 ` Andi Kleen
2009-06-12 15:28 ` Linus Torvalds
2009-06-12 15:35 ` Ingo Molnar
2009-06-12 16:05 ` Rik van Riel
2009-06-12 16:37 ` H. Peter Anvin
2009-06-12 16:48 ` Ingo Molnar
2009-06-15 7:04 ` Nick Piggin
2009-06-15 6:52 ` Nick Piggin
2009-06-16 20:27 ` Russ Anderson
2009-06-17 7:51 ` Nick Piggin
2009-06-12 15:45 ` Ingo Molnar
2009-06-12 16:12 ` Linus Torvalds
2009-06-11 14:22 ` [PATCH 2/5] HWPOISON: fix tasklist_lock/anon_vma locking order Wu Fengguang
2009-06-11 15:59 ` Rik van Riel
2009-06-12 10:03 ` Andi Kleen
2009-06-12 10:07 ` Nick Piggin
2009-06-12 13:27 ` Wu Fengguang
2009-06-12 14:04 ` Wu Fengguang
2009-06-11 14:22 ` Wu Fengguang [this message]
2009-06-11 16:06 ` [PATCH 3/5] HWPOISON: remove early kill option for now Rik van Riel
2009-06-12 9:59 ` Andi Kleen
2009-06-11 14:22 ` [PATCH 4/5] HWPOISON: report sticky EIO for poisoned file Wu Fengguang
2009-06-11 16:31 ` Rik van Riel
2009-06-12 10:07 ` Andi Kleen
2009-06-12 13:41 ` Wu Fengguang
2009-06-11 14:22 ` [PATCH 5/5] HWPOISON: use the safer invalidate page for possible metadata pages Wu Fengguang
2009-06-11 16:36 ` Rik van Riel
2009-06-12 10:56 ` [PATCH 0/5] [RFC] HWPOISON incremental fixes Andi Kleen
2009-06-12 13:59 ` Wu Fengguang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090611144430.682162784@intel.com \
--to=fengguang.wu@intel.com \
--cc=akpm@linux-foundation.org \
--cc=andi@firstfloor.org \
--cc=chris.mason@oracle.com \
--cc=hugh.dickins@tiscali.co.uk \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=npiggin@suse.de \
--cc=riel@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).