diff for duplicates of <20150220180218.GA4285@redhat.com> diff --git a/a/1.txt b/N1/1.txt index e9abda1..367a60c 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -43,3 +43,97 @@ Thanks, Andrea == +>From aaa03f8c142c9a486e3e49de80f52d01a930ba3d Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli <aarcange@redhat.com> +Date: Fri, 20 Feb 2015 18:08:57 +0100 +Subject: [PATCH] mm: incorporate zero pages into transparent huge pages fix + +After applying the "incorporate zero pages into transparent huge +pages" feature, I've got an oops on a overnight stress test: + +------------[ cut here ]------------ +kernel BUG at mm/huge_memory.c:1920! +invalid opcode: 0000 [#1] SMP +Modules linked in: tun usbhid snd_hda_codec_realtek x86_pkg_temp_thermal snd_hda_codec_generic kvm_intel kvm snd_hda_intel crc32c_intel snd_hda_controller ghash_clmulni_intel xhci_pci snd_hda_codec xhci_hcd ehci_pci ehci_hcd snd_pcm usbcore psmouse sr_mod snd_timer snd cdrom pcspkr usb_common [last unloaded: microcode] +CPU: 4 PID: 4250 Comm: Analysis Helper Not tainted 3.19.0+ #5 +Hardware name: /DH61BE, BIOS BEH6110H.86A.0120.2013.1112.1412 11/12/2013 +task: ffff88040c520840 ti: ffff880406070000 task.ti: ffff880406070000 +RIP: 0010:[<ffffffff811ad362>] [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0 +RSP: 0018:ffff880406073c58 EFLAGS: 00010282 +RAX: 8000000163b2f067 RBX: ffff880406ac5f90 RCX: ffff880404f70978 +RDX: ffffea0000000000 RSI: ffff880404f70000 RDI: 8000000163b2f047 +RBP: ffff880408de25c0 R08: 00000000058ecbc0 R09: 00007f2dfe52f000 +R10: 0000000000000000 R11: 00007f2dfe600000 R12: 00007f2dfe400000 +R13: 0000000404f70067 R14: ffffc00000000fff R15: ffff8800d04da2e0 +FS: 00007f2e12168700(0000) GS:ffff88041f300000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f03b9871000 CR3: 0000000407c9d000 CR4: 00000000000427e0 +Stack: +ffff880406073d08 00000007f2dfe400 00007f2d00000001 ffff880407f2dcd0 +ffffea00058e8000 ffff880400000000 00000000058e8000 ffffffff81b8bf40 +0000000000000004 ffffea00101ab170 ffff880408d273e8 ffff880406ac5f90 +Call Trace: +[<ffffffff811add6a>] ? __split_huge_page_pmd+0xfa/0x2a0 +[<ffffffff811838b2>] ? unmap_single_vma+0x2b2/0x810 +[<ffffffff810c658b>] ? try_to_wake_up+0xbb/0x2d0 +[<ffffffff81104df8>] ? get_futex_key+0x1c8/0x2c0 +[<ffffffff81184969>] ? zap_page_range+0x89/0xe0 +[<ffffffff81187150>] ? handle_mm_fault+0xe70/0x1110 +[<ffffffff810cdd2e>] ? set_next_entity+0x4e/0x60 +[<ffffffff811899bc>] ? find_vma+0x5c/0x70 +[<ffffffff81195be3>] ? SyS_madvise+0x4f3/0x760 +[<ffffffff8177d192>] ? system_call_fastpath+0x12/0x17 +Code: ff ff 0f 1f 80 00 00 00 00 c7 44 24 10 00 00 00 00 e9 ef fa ff ff b8 01 00 00 00 e9 3f fb ff ff 48 83 c8 40 e9 f7 fe ff ff 0f 0b <0f> 0b 48 c7 c6 28 b8 a0 81 4c 89 f7 e8 8d 3c fd ff 0f 0b 48 8b +RIP [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0 +RSP <ffff880406073c58> +---[ end trace 6ca92529e1de43ba ]--- + +The oops happens here: + + BUG_ON(!pte_none(*pte)); + +In short when we do the split_huge_page_map we withdraw from the +pgtable deposit of the MM and we find that pgtable isn't fully zero. + +That is most certainly because we didn't clear it if it was a zero +page before putting it in the deposit. This adds the pte_clear to fix +the bug. + +The PT lock could be actually not be taken, as the pte is already +private to us and not visible to any other CPU (we'll be adding it to +the deposit later), but because it's private the lock can't create any +contention. Considering the paravirt calls (which also should be +superfluous) may end up being invoked and make assumptions, I thought +it was safer to keep the locking protocol the same, even if the +pgtable is already private. In order to drop it however, we should +drop it from the other path too. If we want to optimize away the lock +from both branches, it's better to do it in a separate patch. + +Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> +--- + mm/huge_memory.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index a63da02..f0207cf 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2205,6 +2205,18 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); ++ if (is_zero_pfn(pte_pfn(pteval))) { ++ /* ++ * ptl mostly unnecessary. ++ */ ++ spin_lock(ptl); ++ /* ++ * paravirt calls inside pte_clear here are ++ * superfluous. ++ */ ++ pte_clear(vma->vm_mm, address, _pte); ++ spin_unlock(ptl); ++ } + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); diff --git a/a/content_digest b/N1/content_digest index a5e5d9a..101f0de 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -62,6 +62,100 @@ "Thanks,\n" "Andrea\n" "\n" - == + "==\n" + ">From aaa03f8c142c9a486e3e49de80f52d01a930ba3d Mon Sep 17 00:00:00 2001\n" + "From: Andrea Arcangeli <aarcange@redhat.com>\n" + "Date: Fri, 20 Feb 2015 18:08:57 +0100\n" + "Subject: [PATCH] mm: incorporate zero pages into transparent huge pages fix\n" + "\n" + "After applying the \"incorporate zero pages into transparent huge\n" + "pages\" feature, I've got an oops on a overnight stress test:\n" + "\n" + "------------[ cut here ]------------\n" + "kernel BUG at mm/huge_memory.c:1920!\n" + "invalid opcode: 0000 [#1] SMP\n" + "Modules linked in: tun usbhid snd_hda_codec_realtek x86_pkg_temp_thermal snd_hda_codec_generic kvm_intel kvm snd_hda_intel crc32c_intel snd_hda_controller ghash_clmulni_intel xhci_pci snd_hda_codec xhci_hcd ehci_pci ehci_hcd snd_pcm usbcore psmouse sr_mod snd_timer snd cdrom pcspkr usb_common [last unloaded: microcode]\n" + "CPU: 4 PID: 4250 Comm: Analysis Helper Not tainted 3.19.0+ #5\n" + "Hardware name: /DH61BE, BIOS BEH6110H.86A.0120.2013.1112.1412 11/12/2013\n" + "task: ffff88040c520840 ti: ffff880406070000 task.ti: ffff880406070000\n" + "RIP: 0010:[<ffffffff811ad362>] [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0\n" + "RSP: 0018:ffff880406073c58 EFLAGS: 00010282\n" + "RAX: 8000000163b2f067 RBX: ffff880406ac5f90 RCX: ffff880404f70978\n" + "RDX: ffffea0000000000 RSI: ffff880404f70000 RDI: 8000000163b2f047\n" + "RBP: ffff880408de25c0 R08: 00000000058ecbc0 R09: 00007f2dfe52f000\n" + "R10: 0000000000000000 R11: 00007f2dfe600000 R12: 00007f2dfe400000\n" + "R13: 0000000404f70067 R14: ffffc00000000fff R15: ffff8800d04da2e0\n" + "FS: 00007f2e12168700(0000) GS:ffff88041f300000(0000) knlGS:0000000000000000\n" + "CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033\n" + "CR2: 00007f03b9871000 CR3: 0000000407c9d000 CR4: 00000000000427e0\n" + "Stack:\n" + "ffff880406073d08 00000007f2dfe400 00007f2d00000001 ffff880407f2dcd0\n" + "ffffea00058e8000 ffff880400000000 00000000058e8000 ffffffff81b8bf40\n" + "0000000000000004 ffffea00101ab170 ffff880408d273e8 ffff880406ac5f90\n" + "Call Trace:\n" + "[<ffffffff811add6a>] ? __split_huge_page_pmd+0xfa/0x2a0\n" + "[<ffffffff811838b2>] ? unmap_single_vma+0x2b2/0x810\n" + "[<ffffffff810c658b>] ? try_to_wake_up+0xbb/0x2d0\n" + "[<ffffffff81104df8>] ? get_futex_key+0x1c8/0x2c0\n" + "[<ffffffff81184969>] ? zap_page_range+0x89/0xe0\n" + "[<ffffffff81187150>] ? handle_mm_fault+0xe70/0x1110\n" + "[<ffffffff810cdd2e>] ? set_next_entity+0x4e/0x60\n" + "[<ffffffff811899bc>] ? find_vma+0x5c/0x70\n" + "[<ffffffff81195be3>] ? SyS_madvise+0x4f3/0x760\n" + "[<ffffffff8177d192>] ? system_call_fastpath+0x12/0x17\n" + "Code: ff ff 0f 1f 80 00 00 00 00 c7 44 24 10 00 00 00 00 e9 ef fa ff ff b8 01 00 00 00 e9 3f fb ff ff 48 83 c8 40 e9 f7 fe ff ff 0f 0b <0f> 0b 48 c7 c6 28 b8 a0 81 4c 89 f7 e8 8d 3c fd ff 0f 0b 48 8b\n" + "RIP [<ffffffff811ad362>] split_huge_page_to_list+0x6a2/0x7c0\n" + "RSP <ffff880406073c58>\n" + "---[ end trace 6ca92529e1de43ba ]---\n" + "\n" + "The oops happens here:\n" + "\n" + "\t\t\tBUG_ON(!pte_none(*pte));\n" + "\n" + "In short when we do the split_huge_page_map we withdraw from the\n" + "pgtable deposit of the MM and we find that pgtable isn't fully zero.\n" + "\n" + "That is most certainly because we didn't clear it if it was a zero\n" + "page before putting it in the deposit. This adds the pte_clear to fix\n" + "the bug.\n" + "\n" + "The PT lock could be actually not be taken, as the pte is already\n" + "private to us and not visible to any other CPU (we'll be adding it to\n" + "the deposit later), but because it's private the lock can't create any\n" + "contention. Considering the paravirt calls (which also should be\n" + "superfluous) may end up being invoked and make assumptions, I thought\n" + "it was safer to keep the locking protocol the same, even if the\n" + "pgtable is already private. In order to drop it however, we should\n" + "drop it from the other path too. If we want to optimize away the lock\n" + "from both branches, it's better to do it in a separate patch.\n" + "\n" + "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n" + "---\n" + " mm/huge_memory.c | 12 ++++++++++++\n" + " 1 file changed, 12 insertions(+)\n" + "\n" + "diff --git a/mm/huge_memory.c b/mm/huge_memory.c\n" + "index a63da02..f0207cf 100644\n" + "--- a/mm/huge_memory.c\n" + "+++ b/mm/huge_memory.c\n" + "@@ -2205,6 +2205,18 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,\n" + " \t\tif (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {\n" + " \t\t\tclear_user_highpage(page, address);\n" + " \t\t\tadd_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);\n" + "+\t\t\tif (is_zero_pfn(pte_pfn(pteval))) {\n" + "+\t\t\t\t/*\n" + "+\t\t\t\t * ptl mostly unnecessary.\n" + "+\t\t\t\t */\n" + "+\t\t\t\tspin_lock(ptl);\n" + "+\t\t\t\t/*\n" + "+\t\t\t\t * paravirt calls inside pte_clear here are\n" + "+\t\t\t\t * superfluous.\n" + "+\t\t\t\t */\n" + "+\t\t\t\tpte_clear(vma->vm_mm, address, _pte);\n" + "+\t\t\t\tspin_unlock(ptl);\n" + "+\t\t\t}\n" + " \t\t} else {\n" + " \t\t\tsrc_page = pte_page(pteval);\n" + " \t\t\tcopy_user_highpage(page, src_page, address, vma);" -176f453c52b9dc067c73a4d786c25b639f1c68c402d5723bb7b4b56da1a5efba +36626b3fbbe1d60054ac3377f0d1b9a4b3c61b43c4a0e819a2a32181ce8bbd3e
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.