diff for duplicates of <20150330052250.GA3008@blaptop> diff --git a/a/1.txt b/N1/1.txt index f2a771d..0f3661e 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -1 +1,139 @@ 2nd description trial. + +>From ccfc6c79634f6cec69d8fb23b0e863ebfa5b893c Mon Sep 17 00:00:00 2001 +From: Minchan Kim <minchan@kernel.org> +Date: Mon, 30 Mar 2015 13:43:08 +0900 +Subject: [PATCH v2] mm: make every pte dirty on do_swap_page + +Bascially, MADV_FREE relys on the dirty bit in page table entry +to decide whether VM allows to discard the page or not. +IOW, if page table entry includes marked dirty bit, VM shouldn't +discard the page. + +However, if swap-in by read fault happens, page table entry +point out the page doesn't have marked dirty bit so MADV_FREE +might discard the page wrongly. For avoiding the problem, +MADV_FREE did more checks with PageDirty and PageSwapCache. +It worked out because swapped-in page lives on swap cache +and since it was evicted from the swap cache, the page has +PG_dirty flag. So both page flags checks effectvely prevent +wrong discarding by MADV_FREE. + +A problem in above logic is that swapped-in page has PG_dirty +since they are removed from swap cache so VM cannot consider +those pages as freeable any more alghouth madvise_free is +called in future. Look at below example for detail. + +ptr = malloc(); +memset(ptr); +.. +.. +.. heavy memory pressure so all of pages are swapped out +.. +.. +var = *ptr; -> a page swapped-in and removed from swapcache. + page table doesn't mark dirty bit and page + descriptor includes PG_dirty +.. +.. +madvise_free(ptr); +.. +.. +.. +.. heavy memory pressure again. +.. In this time, VM cannot discard the page because the page +.. has *PG_dirty* + +Rather than relying on the PG_dirty of page descriptor +for preventing discarding a page, dirty bit in page table is more +straightforward and simple. So, this patch makes page table dirty +bit marked whenever swap-in happens. Inherenty, page table entry +point out swapped-out page had dirty bit so I think it's no prblem. + +With this, it removes complicated logic and makes freeable page +checking by madvise_free simple. Of course, we could solve +above mentioned example. + +Cc: Hugh Dickins <hughd@google.com> +Cc: Cyrill Gorcunov <gorcunov@gmail.com> +Cc: Pavel Emelyanov <xemul@parallels.com> +Reported-by: Yalin Wang <yalin.wang@sonymobile.com> +Signed-off-by: Minchan Kim <minchan@kernel.org> +--- + +* From v1: + * Rewrite description - Andrew + + mm/madvise.c | 1 - + mm/memory.c | 10 ++++++++-- + mm/rmap.c | 2 +- + mm/vmscan.c | 3 +-- + 4 files changed, 10 insertions(+), 6 deletions(-) + +diff --git a/mm/madvise.c b/mm/madvise.c +index 22e8f0c..a045798 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -325,7 +325,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + continue; + } + +- ClearPageDirty(page); + unlock_page(page); + } + +diff --git a/mm/memory.c b/mm/memory.c +index 6743966..48ff537 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2521,9 +2521,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + + inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); +- pte = mk_pte(page, vma->vm_page_prot); ++ ++ /* ++ * The page is swapping in now was dirty before it was swapped out ++ * so restore the state again(ie, pte_mkdirty) because MADV_FREE ++ * relies on the dirty bit on page table. ++ */ ++ pte = pte_mkdirty(mk_pte(page, vma->vm_page_prot)); + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { +- pte = maybe_mkwrite(pte_mkdirty(pte), vma); ++ pte = maybe_mkwrite(pte, vma); + flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + exclusive = 1; +diff --git a/mm/rmap.c b/mm/rmap.c +index dad23a4..281e806 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -1275,7 +1275,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + + if (flags & TTU_FREE) { + VM_BUG_ON_PAGE(PageSwapCache(page), page); +- if (!dirty && !PageDirty(page)) { ++ if (!dirty) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; +diff --git a/mm/vmscan.c b/mm/vmscan.c +index dc6cd51..fffebf0 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -805,8 +805,7 @@ static enum page_references page_check_references(struct page *page, + return PAGEREF_KEEP; + } + +- if (PageAnon(page) && !pte_dirty && !PageSwapCache(page) && +- !PageDirty(page)) ++ if (PageAnon(page) && !pte_dirty && !PageSwapCache(page)) + *freeable = true; + + /* Reclaim if clean, defer dirty pages to writeback */ +-- +1.9.3 + +-- +Kind regards, +Minchan Kim diff --git a/a/content_digest b/N1/content_digest index d467f2c..d4176af 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -17,6 +17,144 @@ " Pavel Emelyanov <xemul@parallels.com>\0" "\00:1\0" "b\0" - 2nd description trial. + "2nd description trial.\n" + "\n" + ">From ccfc6c79634f6cec69d8fb23b0e863ebfa5b893c Mon Sep 17 00:00:00 2001\n" + "From: Minchan Kim <minchan@kernel.org>\n" + "Date: Mon, 30 Mar 2015 13:43:08 +0900\n" + "Subject: [PATCH v2] mm: make every pte dirty on do_swap_page\n" + "\n" + "Bascially, MADV_FREE relys on the dirty bit in page table entry\n" + "to decide whether VM allows to discard the page or not.\n" + "IOW, if page table entry includes marked dirty bit, VM shouldn't\n" + "discard the page.\n" + "\n" + "However, if swap-in by read fault happens, page table entry\n" + "point out the page doesn't have marked dirty bit so MADV_FREE\n" + "might discard the page wrongly. For avoiding the problem,\n" + "MADV_FREE did more checks with PageDirty and PageSwapCache.\n" + "It worked out because swapped-in page lives on swap cache\n" + "and since it was evicted from the swap cache, the page has\n" + "PG_dirty flag. So both page flags checks effectvely prevent\n" + "wrong discarding by MADV_FREE.\n" + "\n" + "A problem in above logic is that swapped-in page has PG_dirty\n" + "since they are removed from swap cache so VM cannot consider\n" + "those pages as freeable any more alghouth madvise_free is\n" + "called in future. Look at below example for detail.\n" + "\n" + "ptr = malloc();\n" + "memset(ptr);\n" + "..\n" + "..\n" + ".. heavy memory pressure so all of pages are swapped out\n" + "..\n" + "..\n" + "var = *ptr; -> a page swapped-in and removed from swapcache.\n" + " page table doesn't mark dirty bit and page\n" + " descriptor includes PG_dirty\n" + "..\n" + "..\n" + "madvise_free(ptr);\n" + "..\n" + "..\n" + "..\n" + ".. heavy memory pressure again.\n" + ".. In this time, VM cannot discard the page because the page\n" + ".. has *PG_dirty*\n" + "\n" + "Rather than relying on the PG_dirty of page descriptor\n" + "for preventing discarding a page, dirty bit in page table is more\n" + "straightforward and simple. So, this patch makes page table dirty\n" + "bit marked whenever swap-in happens. Inherenty, page table entry\n" + "point out swapped-out page had dirty bit so I think it's no prblem.\n" + "\n" + "With this, it removes complicated logic and makes freeable page\n" + "checking by madvise_free simple. Of course, we could solve\n" + "above mentioned example.\n" + "\n" + "Cc: Hugh Dickins <hughd@google.com>\n" + "Cc: Cyrill Gorcunov <gorcunov@gmail.com>\n" + "Cc: Pavel Emelyanov <xemul@parallels.com>\n" + "Reported-by: Yalin Wang <yalin.wang@sonymobile.com>\n" + "Signed-off-by: Minchan Kim <minchan@kernel.org>\n" + "---\n" + "\n" + "* From v1:\n" + " * Rewrite description - Andrew\n" + "\n" + " mm/madvise.c | 1 -\n" + " mm/memory.c | 10 ++++++++--\n" + " mm/rmap.c | 2 +-\n" + " mm/vmscan.c | 3 +--\n" + " 4 files changed, 10 insertions(+), 6 deletions(-)\n" + "\n" + "diff --git a/mm/madvise.c b/mm/madvise.c\n" + "index 22e8f0c..a045798 100644\n" + "--- a/mm/madvise.c\n" + "+++ b/mm/madvise.c\n" + "@@ -325,7 +325,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,\n" + " \t\t\t\tcontinue;\n" + " \t\t\t}\n" + " \n" + "-\t\t\tClearPageDirty(page);\n" + " \t\t\tunlock_page(page);\n" + " \t\t}\n" + " \n" + "diff --git a/mm/memory.c b/mm/memory.c\n" + "index 6743966..48ff537 100644\n" + "--- a/mm/memory.c\n" + "+++ b/mm/memory.c\n" + "@@ -2521,9 +2521,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,\n" + " \n" + " \tinc_mm_counter_fast(mm, MM_ANONPAGES);\n" + " \tdec_mm_counter_fast(mm, MM_SWAPENTS);\n" + "-\tpte = mk_pte(page, vma->vm_page_prot);\n" + "+\n" + "+\t/*\n" + "+\t * The page is swapping in now was dirty before it was swapped out\n" + "+\t * so restore the state again(ie, pte_mkdirty) because MADV_FREE\n" + "+\t * relies on the dirty bit on page table.\n" + "+\t */\n" + "+\tpte = pte_mkdirty(mk_pte(page, vma->vm_page_prot));\n" + " \tif ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {\n" + "-\t\tpte = maybe_mkwrite(pte_mkdirty(pte), vma);\n" + "+\t\tpte = maybe_mkwrite(pte, vma);\n" + " \t\tflags &= ~FAULT_FLAG_WRITE;\n" + " \t\tret |= VM_FAULT_WRITE;\n" + " \t\texclusive = 1;\n" + "diff --git a/mm/rmap.c b/mm/rmap.c\n" + "index dad23a4..281e806 100644\n" + "--- a/mm/rmap.c\n" + "+++ b/mm/rmap.c\n" + "@@ -1275,7 +1275,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,\n" + " \n" + " \t\tif (flags & TTU_FREE) {\n" + " \t\t\tVM_BUG_ON_PAGE(PageSwapCache(page), page);\n" + "-\t\t\tif (!dirty && !PageDirty(page)) {\n" + "+\t\t\tif (!dirty) {\n" + " \t\t\t\t/* It's a freeable page by MADV_FREE */\n" + " \t\t\t\tdec_mm_counter(mm, MM_ANONPAGES);\n" + " \t\t\t\tgoto discard;\n" + "diff --git a/mm/vmscan.c b/mm/vmscan.c\n" + "index dc6cd51..fffebf0 100644\n" + "--- a/mm/vmscan.c\n" + "+++ b/mm/vmscan.c\n" + "@@ -805,8 +805,7 @@ static enum page_references page_check_references(struct page *page,\n" + " \t\treturn PAGEREF_KEEP;\n" + " \t}\n" + " \n" + "-\tif (PageAnon(page) && !pte_dirty && !PageSwapCache(page) &&\n" + "-\t\t\t!PageDirty(page))\n" + "+\tif (PageAnon(page) && !pte_dirty && !PageSwapCache(page))\n" + " \t\t*freeable = true;\n" + " \n" + " \t/* Reclaim if clean, defer dirty pages to writeback */\n" + "-- \n" + "1.9.3\n" + "\n" + "-- \n" + "Kind regards,\n" + Minchan Kim -f85df9b8c917e4c1ecd47eb5a5d8eff8db09ef28134331be085bd261cff0bc0f +19e804f2addba4d10366e53de9f7b9cd0dd344bac872e2724a105153bcc4b4cc
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.