From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
To: Michal Hocko <mhocko@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>,
Andrew Morton <akpm@linux-foundation.org>,
Vlastimil Babka <vbabka@suse.cz>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Stephen Rothwell <sfr@canb.auug.org.au>,
linux-mm@kvack.org, linux-next@vger.kernel.org,
linux-kernel@vger.kernel.org,
Andrea Arcangeli <aarcange@redhat.com>
Subject: Re: [linux-next: Tree for Jun 1] __khugepaged_exit rwsem_down_write_failed lockup
Date: Fri, 3 Jun 2016 17:43:47 +0900 [thread overview]
Message-ID: <20160603084347.GA502@swordfish> (raw)
In-Reply-To: <20160603072536.GB20676@dhcp22.suse.cz>
On (06/03/16 09:25), Michal Hocko wrote:
> > it's quite hard to trigger the bug (somehow), so I can't
> > follow up with more information as of now.
Either I did something very silly while fixing up the patch, or the
patch may be causing general protection faults on my system.
RIP collect_mm_slot() + 0x42/0x84
khugepaged
prepare_to_wait_event
maybe_pmd_mkwrite
kthread
_raw_spin_unlock_irq
ret_from_fork
kthread_create_on_node
collect_mm_slot() + 0x42/0x84 is
0000000000000328 <collect_mm_slot>:
328: 55 push %rbp
329: 48 89 e5 mov %rsp,%rbp
32c: 53 push %rbx
32d: 48 8b 5f 20 mov 0x20(%rdi),%rbx
331: 8b 43 48 mov 0x48(%rbx),%eax
334: ff c8 dec %eax
336: 7f 71 jg 3a9 <collect_mm_slot+0x81>
338: 48 8b 57 08 mov 0x8(%rdi),%rdx
33c: 48 85 d2 test %rdx,%rdx
33f: 74 1e je 35f <collect_mm_slot+0x37>
341: 48 8b 07 mov (%rdi),%rax
344: 48 85 c0 test %rax,%rax
347: 48 89 02 mov %rax,(%rdx)
34a: 74 04 je 350 <collect_mm_slot+0x28>
34c: 48 89 50 08 mov %rdx,0x8(%rax)
350: 48 c7 07 00 00 00 00 movq $0x0,(%rdi)
357: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi)
35e: 00
35f: 48 8b 57 10 mov 0x10(%rdi),%rdx
363: 48 8b 47 18 mov 0x18(%rdi),%rax
367: 48 89 fe mov %rdi,%rsi
36a: 48 89 42 08 mov %rax,0x8(%rdx)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36e: 48 89 10 mov %rdx,(%rax)
371: 48 b8 00 01 00 00 00 movabs $0xdead000000000100,%rax
378: 00 ad de
37b: 48 89 47 10 mov %rax,0x10(%rdi)
37f: 48 b8 00 02 00 00 00 movabs $0xdead000000000200,%rax
386: 00 ad de
389: 48 89 47 18 mov %rax,0x18(%rdi)
38d: 48 8b 3d 00 00 00 00 mov 0x0(%rip),%rdi # 394 <collect_mm_slot+0x6c>
394: e8 00 00 00 00 callq 399 <collect_mm_slot+0x71>
399: f0 ff 4b 4c lock decl 0x4c(%rbx)
39d: 74 02 je 3a1 <collect_mm_slot+0x79>
39f: eb 08 jmp 3a9 <collect_mm_slot+0x81>
3a1: 48 89 df mov %rbx,%rdi
3a4: e8 00 00 00 00 callq 3a9 <collect_mm_slot+0x81>
3a9: 5b pop %rbx
3aa: 5d pop %rbp
3ab: c3 retq
which is list_del(&mm_slot->mm_node), I believe.
I attached the patch (just in case).
---
mm/huge_memory.c | 87 +++++++++++++++++++++++++-------------------------------
1 file changed, 39 insertions(+), 48 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 292cedd..1c82fa4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1938,7 +1938,8 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ /* the only pin is from khugepaged_scan_mm_slot */
+ return atomic_read(&mm->mm_users) <= 1;
}
int __khugepaged_enter(struct mm_struct *mm)
@@ -1950,8 +1951,6 @@ int __khugepaged_enter(struct mm_struct *mm)
if (!mm_slot)
return -ENOMEM;
- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -1994,36 +1993,40 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
return 0;
}
-void __khugepaged_exit(struct mm_struct *mm)
+static void collect_mm_slot(struct mm_slot *mm_slot)
{
- struct mm_slot *mm_slot;
- int free = 0;
+ struct mm_struct *mm = mm_slot->mm;
- spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
- free = 1;
- }
- spin_unlock(&khugepaged_mm_lock);
- if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
- mmdrop(mm);
- } else if (mm_slot) {
/*
- * This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_sem.
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
*/
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot) {
+ collect_mm_slot(mm_slot);
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
}
+ spin_unlock(&khugepaged_mm_lock);
}
static void release_pte_page(struct page *page)
@@ -2738,29 +2741,6 @@ out:
return ret;
}
-static void collect_mm_slot(struct mm_slot *mm_slot)
-{
- struct mm_struct *mm = mm_slot->mm;
-
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
-
- if (khugepaged_test_exit(mm)) {
- /* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
-
- /*
- * Not strictly needed because the mm exited already.
- *
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- */
-
- /* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
- mmdrop(mm);
- }
-}
-
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
@@ -2782,6 +2762,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
+
+ /*
+ * Do not even try to do anything if the current mm is already
+ * dead. khugepaged_mm_lock will make sure only this or
+ * __khugepaged_exit does the unhashing.
+ */
+ if (!atomic_inc_not_zero(&mm_slot->mm->mm_users)) {
+ collect_mm_slot(mm_slot);
+ return progress;
+ }
spin_unlock(&khugepaged_mm_lock);
mm = mm_slot->mm;
@@ -2865,6 +2855,7 @@ breakouterloop_mmap_sem:
collect_mm_slot(mm_slot);
}
+ mmput_async(mm);
return progress;
}
--
2.9.0.rc1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>
WARNING: multiple messages have this Message-ID (diff)
From: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
To: Michal Hocko <mhocko@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>,
Andrew Morton <akpm@linux-foundation.org>,
Vlastimil Babka <vbabka@suse.cz>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Stephen Rothwell <sfr@canb.auug.org.au>,
linux-mm@kvack.org, linux-next@vger.kernel.org,
linux-kernel@vger.kernel.org,
Andrea Arcangeli <aarcange@redhat.com>
Subject: Re: [linux-next: Tree for Jun 1] __khugepaged_exit rwsem_down_write_failed lockup
Date: Fri, 3 Jun 2016 17:43:47 +0900 [thread overview]
Message-ID: <20160603084347.GA502@swordfish> (raw)
In-Reply-To: <20160603072536.GB20676@dhcp22.suse.cz>
On (06/03/16 09:25), Michal Hocko wrote:
> > it's quite hard to trigger the bug (somehow), so I can't
> > follow up with more information as of now.
Either I did something very silly while fixing up the patch, or the
patch may be causing general protection faults on my system.
RIP collect_mm_slot() + 0x42/0x84
khugepaged
prepare_to_wait_event
maybe_pmd_mkwrite
kthread
_raw_spin_unlock_irq
ret_from_fork
kthread_create_on_node
collect_mm_slot() + 0x42/0x84 is
0000000000000328 <collect_mm_slot>:
328: 55 push %rbp
329: 48 89 e5 mov %rsp,%rbp
32c: 53 push %rbx
32d: 48 8b 5f 20 mov 0x20(%rdi),%rbx
331: 8b 43 48 mov 0x48(%rbx),%eax
334: ff c8 dec %eax
336: 7f 71 jg 3a9 <collect_mm_slot+0x81>
338: 48 8b 57 08 mov 0x8(%rdi),%rdx
33c: 48 85 d2 test %rdx,%rdx
33f: 74 1e je 35f <collect_mm_slot+0x37>
341: 48 8b 07 mov (%rdi),%rax
344: 48 85 c0 test %rax,%rax
347: 48 89 02 mov %rax,(%rdx)
34a: 74 04 je 350 <collect_mm_slot+0x28>
34c: 48 89 50 08 mov %rdx,0x8(%rax)
350: 48 c7 07 00 00 00 00 movq $0x0,(%rdi)
357: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi)
35e: 00
35f: 48 8b 57 10 mov 0x10(%rdi),%rdx
363: 48 8b 47 18 mov 0x18(%rdi),%rax
367: 48 89 fe mov %rdi,%rsi
36a: 48 89 42 08 mov %rax,0x8(%rdx)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36e: 48 89 10 mov %rdx,(%rax)
371: 48 b8 00 01 00 00 00 movabs $0xdead000000000100,%rax
378: 00 ad de
37b: 48 89 47 10 mov %rax,0x10(%rdi)
37f: 48 b8 00 02 00 00 00 movabs $0xdead000000000200,%rax
386: 00 ad de
389: 48 89 47 18 mov %rax,0x18(%rdi)
38d: 48 8b 3d 00 00 00 00 mov 0x0(%rip),%rdi # 394 <collect_mm_slot+0x6c>
394: e8 00 00 00 00 callq 399 <collect_mm_slot+0x71>
399: f0 ff 4b 4c lock decl 0x4c(%rbx)
39d: 74 02 je 3a1 <collect_mm_slot+0x79>
39f: eb 08 jmp 3a9 <collect_mm_slot+0x81>
3a1: 48 89 df mov %rbx,%rdi
3a4: e8 00 00 00 00 callq 3a9 <collect_mm_slot+0x81>
3a9: 5b pop %rbx
3aa: 5d pop %rbp
3ab: c3 retq
which is list_del(&mm_slot->mm_node), I believe.
I attached the patch (just in case).
---
mm/huge_memory.c | 87 +++++++++++++++++++++++++-------------------------------
1 file changed, 39 insertions(+), 48 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 292cedd..1c82fa4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1938,7 +1938,8 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ /* the only pin is from khugepaged_scan_mm_slot */
+ return atomic_read(&mm->mm_users) <= 1;
}
int __khugepaged_enter(struct mm_struct *mm)
@@ -1950,8 +1951,6 @@ int __khugepaged_enter(struct mm_struct *mm)
if (!mm_slot)
return -ENOMEM;
- /* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -1994,36 +1993,40 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
return 0;
}
-void __khugepaged_exit(struct mm_struct *mm)
+static void collect_mm_slot(struct mm_slot *mm_slot)
{
- struct mm_slot *mm_slot;
- int free = 0;
+ struct mm_struct *mm = mm_slot->mm;
- spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
- free = 1;
- }
- spin_unlock(&khugepaged_mm_lock);
- if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
- mmdrop(mm);
- } else if (mm_slot) {
/*
- * This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_sem.
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
*/
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot) {
+ collect_mm_slot(mm_slot);
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
}
+ spin_unlock(&khugepaged_mm_lock);
}
static void release_pte_page(struct page *page)
@@ -2738,29 +2741,6 @@ out:
return ret;
}
-static void collect_mm_slot(struct mm_slot *mm_slot)
-{
- struct mm_struct *mm = mm_slot->mm;
-
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
-
- if (khugepaged_test_exit(mm)) {
- /* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
-
- /*
- * Not strictly needed because the mm exited already.
- *
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- */
-
- /* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
- mmdrop(mm);
- }
-}
-
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
@@ -2782,6 +2762,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
+
+ /*
+ * Do not even try to do anything if the current mm is already
+ * dead. khugepaged_mm_lock will make sure only this or
+ * __khugepaged_exit does the unhashing.
+ */
+ if (!atomic_inc_not_zero(&mm_slot->mm->mm_users)) {
+ collect_mm_slot(mm_slot);
+ return progress;
+ }
spin_unlock(&khugepaged_mm_lock);
mm = mm_slot->mm;
@@ -2865,6 +2855,7 @@ breakouterloop_mmap_sem:
collect_mm_slot(mm_slot);
}
+ mmput_async(mm);
return progress;
}
--
2.9.0.rc1
next prev parent reply other threads:[~2016-06-03 8:43 UTC|newest]
Thread overview: 56+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-06-01 3:11 linux-next: Tree for Jun 1 Stephen Rothwell
2016-06-02 1:48 ` [linux-next: Tree for Jun 1] __khugepaged_exit rwsem_down_write_failed lockup Sergey Senozhatsky
2016-06-02 1:48 ` Sergey Senozhatsky
2016-06-02 9:21 ` Michal Hocko
2016-06-02 9:21 ` Michal Hocko
2016-06-02 12:08 ` Sergey Senozhatsky
2016-06-02 12:08 ` Sergey Senozhatsky
2016-06-02 12:21 ` Michal Hocko
2016-06-02 12:21 ` Michal Hocko
2016-06-03 13:51 ` Andrea Arcangeli
2016-06-03 13:51 ` Andrea Arcangeli
2016-06-03 14:46 ` Michal Hocko
2016-06-03 14:46 ` Michal Hocko
2016-06-03 15:10 ` Andrea Arcangeli
2016-06-03 15:10 ` Andrea Arcangeli
2016-06-07 7:34 ` Michal Hocko
2016-06-07 7:34 ` Michal Hocko
2016-06-08 8:19 ` Vlastimil Babka
2016-06-08 8:19 ` Vlastimil Babka
2016-06-03 7:15 ` Sergey Senozhatsky
2016-06-03 7:15 ` Sergey Senozhatsky
2016-06-03 7:25 ` Michal Hocko
2016-06-03 7:25 ` Michal Hocko
2016-06-03 8:43 ` Sergey Senozhatsky [this message]
2016-06-03 8:43 ` Sergey Senozhatsky
2016-06-03 9:55 ` Michal Hocko
2016-06-03 9:55 ` Michal Hocko
2016-06-03 10:05 ` Michal Hocko
2016-06-03 10:05 ` Michal Hocko
2016-06-03 13:38 ` Sergey Senozhatsky
2016-06-03 13:38 ` Sergey Senozhatsky
2016-06-03 13:45 ` Michal Hocko
2016-06-03 13:45 ` Michal Hocko
2016-06-03 13:49 ` Michal Hocko
2016-06-03 13:49 ` Michal Hocko
2016-06-03 13:49 ` Michal Hocko
2016-06-04 7:51 ` Sergey Senozhatsky
2016-06-04 7:51 ` Sergey Senozhatsky
2016-06-06 8:39 ` Michal Hocko
2016-06-06 8:39 ` Michal Hocko
2016-06-02 13:24 ` Vlastimil Babka
2016-06-02 13:24 ` Vlastimil Babka
2016-06-02 18:58 ` Ebru Akagunduz
2016-06-02 18:58 ` Ebru Akagunduz
2016-06-03 1:00 ` Sergey Senozhatsky
2016-06-03 1:00 ` Sergey Senozhatsky
2016-06-03 1:29 ` Sergey Senozhatsky
2016-06-03 1:29 ` Sergey Senozhatsky
2016-06-03 4:14 ` Sergey Senozhatsky
2016-06-03 4:14 ` Sergey Senozhatsky
2016-06-03 12:28 ` [PATCH] mm, thp: fix locking inconsistency in collapse_huge_page Ebru Akagunduz
2016-06-03 12:28 ` Ebru Akagunduz
2016-06-06 13:05 ` Vlastimil Babka
2016-06-06 13:05 ` Vlastimil Babka
2016-06-09 3:51 ` Sergey Senozhatsky
2016-06-09 3:51 ` Sergey Senozhatsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160603084347.GA502@swordfish \
--to=sergey.senozhatsky.work@gmail.com \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-next@vger.kernel.org \
--cc=mhocko@kernel.org \
--cc=sfr@canb.auug.org.au \
--cc=vbabka@suse.cz \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.