All of lore.kernel.org
 help / color / mirror / Atom feed
diff for duplicates of <20150220180218.GA4285@redhat.com>

diff --git a/a/1.txt b/N1/1.txt
index e9abda1..367a60c 100644
--- a/a/1.txt
+++ b/N1/1.txt
@@ -43,3 +43,97 @@ Thanks,
 Andrea
 
 ==
+>From aaa03f8c142c9a486e3e49de80f52d01a930ba3d Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Fri, 20 Feb 2015 18:08:57 +0100
+Subject: [PATCH] mm: incorporate zero pages into transparent huge pages fix
+
+After applying the "incorporate zero pages into transparent huge
+pages" feature, I've got an oops on a overnight stress test:
+
+------------[ cut here ]------------
+kernel BUG at mm/huge_memory.c:1920!
+invalid opcode: 0000 [#1] SMP
+Modules linked in: tun usbhid snd_hda_codec_realtek x86_pkg_temp_thermal snd_hda_codec_generic kvm_intel kvm snd_hda_intel crc32c_intel snd_hda_controller ghash_clmulni_intel xhci_pci snd_hda_codec xhci_hcd ehci_pci ehci_hcd snd_pcm usbcore psmouse sr_mod snd_timer snd cdrom pcspkr usb_common [last unloaded: microcode]
+CPU: 4 PID: 4250 Comm: Analysis Helper Not tainted 3.19.0+ #5
+Hardware name:                  /DH61BE, BIOS BEH6110H.86A.0120.2013.1112.1412 11/12/2013
+task: ffff88040c520840 ti: ffff880406070000 task.ti: ffff880406070000
+RIP: 0010:[<ffffffff811ad362>]  [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0
+RSP: 0018:ffff880406073c58  EFLAGS: 00010282
+RAX: 8000000163b2f067 RBX: ffff880406ac5f90 RCX: ffff880404f70978
+RDX: ffffea0000000000 RSI: ffff880404f70000 RDI: 8000000163b2f047
+RBP: ffff880408de25c0 R08: 00000000058ecbc0 R09: 00007f2dfe52f000
+R10: 0000000000000000 R11: 00007f2dfe600000 R12: 00007f2dfe400000
+R13: 0000000404f70067 R14: ffffc00000000fff R15: ffff8800d04da2e0
+FS:  00007f2e12168700(0000) GS:ffff88041f300000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f03b9871000 CR3: 0000000407c9d000 CR4: 00000000000427e0
+Stack:
+ffff880406073d08 00000007f2dfe400 00007f2d00000001 ffff880407f2dcd0
+ffffea00058e8000 ffff880400000000 00000000058e8000 ffffffff81b8bf40
+0000000000000004 ffffea00101ab170 ffff880408d273e8 ffff880406ac5f90
+Call Trace:
+[<ffffffff811add6a>] ? __split_huge_page_pmd+0xfa/0x2a0
+[<ffffffff811838b2>] ? unmap_single_vma+0x2b2/0x810
+[<ffffffff810c658b>] ? try_to_wake_up+0xbb/0x2d0
+[<ffffffff81104df8>] ? get_futex_key+0x1c8/0x2c0
+[<ffffffff81184969>] ? zap_page_range+0x89/0xe0
+[<ffffffff81187150>] ? handle_mm_fault+0xe70/0x1110
+[<ffffffff810cdd2e>] ? set_next_entity+0x4e/0x60
+[<ffffffff811899bc>] ? find_vma+0x5c/0x70
+[<ffffffff81195be3>] ? SyS_madvise+0x4f3/0x760
+[<ffffffff8177d192>] ? system_call_fastpath+0x12/0x17
+Code: ff ff 0f 1f 80 00 00 00 00 c7 44 24 10 00 00 00 00 e9 ef fa ff ff b8 01 00 00 00 e9 3f fb ff ff 48 83 c8 40 e9 f7 fe ff ff 0f 0b <0f> 0b 48 c7 c6 28 b8 a0 81 4c 89 f7 e8 8d 3c fd ff 0f 0b 48 8b
+RIP  [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0
+RSP <ffff880406073c58>
+---[ end trace 6ca92529e1de43ba ]---
+
+The oops happens here:
+
+			BUG_ON(!pte_none(*pte));
+
+In short when we do the split_huge_page_map we withdraw from the
+pgtable deposit of the MM and we find that pgtable isn't fully zero.
+
+That is most certainly because we didn't clear it if it was a zero
+page before putting it in the deposit. This adds the pte_clear to fix
+the bug.
+
+The PT lock could be actually not be taken, as the pte is already
+private to us and not visible to any other CPU (we'll be adding it to
+the deposit later), but because it's private the lock can't create any
+contention. Considering the paravirt calls (which also should be
+superfluous) may end up being invoked and make assumptions, I thought
+it was safer to keep the locking protocol the same, even if the
+pgtable is already private. In order to drop it however, we should
+drop it from the other path too. If we want to optimize away the lock
+from both branches, it's better to do it in a separate patch.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ mm/huge_memory.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index a63da02..f0207cf 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2205,6 +2205,18 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ 			clear_user_highpage(page, address);
+ 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
++			if (is_zero_pfn(pte_pfn(pteval))) {
++				/*
++				 * ptl mostly unnecessary.
++				 */
++				spin_lock(ptl);
++				/*
++				 * paravirt calls inside pte_clear here are
++				 * superfluous.
++				 */
++				pte_clear(vma->vm_mm, address, _pte);
++				spin_unlock(ptl);
++			}
+ 		} else {
+ 			src_page = pte_page(pteval);
+ 			copy_user_highpage(page, src_page, address, vma);
diff --git a/a/content_digest b/N1/content_digest
index a5e5d9a..101f0de 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -62,6 +62,100 @@
  "Thanks,\n"
  "Andrea\n"
  "\n"
- ==
+ "==\n"
+ ">From aaa03f8c142c9a486e3e49de80f52d01a930ba3d Mon Sep 17 00:00:00 2001\n"
+ "From: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Date: Fri, 20 Feb 2015 18:08:57 +0100\n"
+ "Subject: [PATCH] mm: incorporate zero pages into transparent huge pages fix\n"
+ "\n"
+ "After applying the \"incorporate zero pages into transparent huge\n"
+ "pages\" feature, I've got an oops on a overnight stress test:\n"
+ "\n"
+ "------------[ cut here ]------------\n"
+ "kernel BUG at mm/huge_memory.c:1920!\n"
+ "invalid opcode: 0000 [#1] SMP\n"
+ "Modules linked in: tun usbhid snd_hda_codec_realtek x86_pkg_temp_thermal snd_hda_codec_generic kvm_intel kvm snd_hda_intel crc32c_intel snd_hda_controller ghash_clmulni_intel xhci_pci snd_hda_codec xhci_hcd ehci_pci ehci_hcd snd_pcm usbcore psmouse sr_mod snd_timer snd cdrom pcspkr usb_common [last unloaded: microcode]\n"
+ "CPU: 4 PID: 4250 Comm: Analysis Helper Not tainted 3.19.0+ #5\n"
+ "Hardware name:                  /DH61BE, BIOS BEH6110H.86A.0120.2013.1112.1412 11/12/2013\n"
+ "task: ffff88040c520840 ti: ffff880406070000 task.ti: ffff880406070000\n"
+ "RIP: 0010:[<ffffffff811ad362>]  [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0\n"
+ "RSP: 0018:ffff880406073c58  EFLAGS: 00010282\n"
+ "RAX: 8000000163b2f067 RBX: ffff880406ac5f90 RCX: ffff880404f70978\n"
+ "RDX: ffffea0000000000 RSI: ffff880404f70000 RDI: 8000000163b2f047\n"
+ "RBP: ffff880408de25c0 R08: 00000000058ecbc0 R09: 00007f2dfe52f000\n"
+ "R10: 0000000000000000 R11: 00007f2dfe600000 R12: 00007f2dfe400000\n"
+ "R13: 0000000404f70067 R14: ffffc00000000fff R15: ffff8800d04da2e0\n"
+ "FS:  00007f2e12168700(0000) GS:ffff88041f300000(0000) knlGS:0000000000000000\n"
+ "CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033\n"
+ "CR2: 00007f03b9871000 CR3: 0000000407c9d000 CR4: 00000000000427e0\n"
+ "Stack:\n"
+ "ffff880406073d08 00000007f2dfe400 00007f2d00000001 ffff880407f2dcd0\n"
+ "ffffea00058e8000 ffff880400000000 00000000058e8000 ffffffff81b8bf40\n"
+ "0000000000000004 ffffea00101ab170 ffff880408d273e8 ffff880406ac5f90\n"
+ "Call Trace:\n"
+ "[<ffffffff811add6a>] ? __split_huge_page_pmd+0xfa/0x2a0\n"
+ "[<ffffffff811838b2>] ? unmap_single_vma+0x2b2/0x810\n"
+ "[<ffffffff810c658b>] ? try_to_wake_up+0xbb/0x2d0\n"
+ "[<ffffffff81104df8>] ? get_futex_key+0x1c8/0x2c0\n"
+ "[<ffffffff81184969>] ? zap_page_range+0x89/0xe0\n"
+ "[<ffffffff81187150>] ? handle_mm_fault+0xe70/0x1110\n"
+ "[<ffffffff810cdd2e>] ? set_next_entity+0x4e/0x60\n"
+ "[<ffffffff811899bc>] ? find_vma+0x5c/0x70\n"
+ "[<ffffffff81195be3>] ? SyS_madvise+0x4f3/0x760\n"
+ "[<ffffffff8177d192>] ? system_call_fastpath+0x12/0x17\n"
+ "Code: ff ff 0f 1f 80 00 00 00 00 c7 44 24 10 00 00 00 00 e9 ef fa ff ff b8 01 00 00 00 e9 3f fb ff ff 48 83 c8 40 e9 f7 fe ff ff 0f 0b <0f> 0b 48 c7 c6 28 b8 a0 81 4c 89 f7 e8 8d 3c fd ff 0f 0b 48 8b\n"
+ "RIP  [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0\n"
+ "RSP <ffff880406073c58>\n"
+ "---[ end trace 6ca92529e1de43ba ]---\n"
+ "\n"
+ "The oops happens here:\n"
+ "\n"
+ "\t\t\tBUG_ON(!pte_none(*pte));\n"
+ "\n"
+ "In short when we do the split_huge_page_map we withdraw from the\n"
+ "pgtable deposit of the MM and we find that pgtable isn't fully zero.\n"
+ "\n"
+ "That is most certainly because we didn't clear it if it was a zero\n"
+ "page before putting it in the deposit. This adds the pte_clear to fix\n"
+ "the bug.\n"
+ "\n"
+ "The PT lock could be actually not be taken, as the pte is already\n"
+ "private to us and not visible to any other CPU (we'll be adding it to\n"
+ "the deposit later), but because it's private the lock can't create any\n"
+ "contention. Considering the paravirt calls (which also should be\n"
+ "superfluous) may end up being invoked and make assumptions, I thought\n"
+ "it was safer to keep the locking protocol the same, even if the\n"
+ "pgtable is already private. In order to drop it however, we should\n"
+ "drop it from the other path too. If we want to optimize away the lock\n"
+ "from both branches, it's better to do it in a separate patch.\n"
+ "\n"
+ "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "---\n"
+ " mm/huge_memory.c | 12 ++++++++++++\n"
+ " 1 file changed, 12 insertions(+)\n"
+ "\n"
+ "diff --git a/mm/huge_memory.c b/mm/huge_memory.c\n"
+ "index a63da02..f0207cf 100644\n"
+ "--- a/mm/huge_memory.c\n"
+ "+++ b/mm/huge_memory.c\n"
+ "@@ -2205,6 +2205,18 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,\n"
+ " \t\tif (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {\n"
+ " \t\t\tclear_user_highpage(page, address);\n"
+ " \t\t\tadd_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);\n"
+ "+\t\t\tif (is_zero_pfn(pte_pfn(pteval))) {\n"
+ "+\t\t\t\t/*\n"
+ "+\t\t\t\t * ptl mostly unnecessary.\n"
+ "+\t\t\t\t */\n"
+ "+\t\t\t\tspin_lock(ptl);\n"
+ "+\t\t\t\t/*\n"
+ "+\t\t\t\t * paravirt calls inside pte_clear here are\n"
+ "+\t\t\t\t * superfluous.\n"
+ "+\t\t\t\t */\n"
+ "+\t\t\t\tpte_clear(vma->vm_mm, address, _pte);\n"
+ "+\t\t\t\tspin_unlock(ptl);\n"
+ "+\t\t\t}\n"
+ " \t\t} else {\n"
+ " \t\t\tsrc_page = pte_page(pteval);\n"
+ " \t\t\tcopy_user_highpage(page, src_page, address, vma);"
 
-176f453c52b9dc067c73a4d786c25b639f1c68c402d5723bb7b4b56da1a5efba
+36626b3fbbe1d60054ac3377f0d1b9a4b3c61b43c4a0e819a2a32181ce8bbd3e

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.