From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx186.postini.com [74.125.245.186]) by kanga.kvack.org (Postfix) with SMTP id 236056B0032 for ; Tue, 30 Jul 2013 16:46:58 -0400 (EDT) Received: by mail-la0-f52.google.com with SMTP id fq13so4473619lab.25 for ; Tue, 30 Jul 2013 13:46:56 -0700 (PDT) Message-Id: <20130730204154.407090410@gmail.com> Date: Wed, 31 Jul 2013 00:41:54 +0400 From: Cyrill Gorcunov Subject: [patch 0/2] Soft-dirty page tracker improvemens Sender: owner-linux-mm@kvack.org List-ID: To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Hi, as reported by Andy, there are a couple of situations in which the soft-dirty bit is lost, in particular when the page we're tracking goes to swap and when a file page gets reclaimed. This series addresses both problems. One more difficulty which remains is the scenario when a vma area (which has the soft-dirty bit set in the appropriate pte entries) gets unmapped and then a new one is mapped in-place. I'm working on it now and hope to provide a patch soon. Thanks, Cyrill -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx135.postini.com [74.125.245.135]) by kanga.kvack.org (Postfix) with SMTP id E3ADD6B0032 for ; Tue, 30 Jul 2013 16:46:58 -0400 (EDT) Received: by mail-la0-f52.google.com with SMTP id fq13so4473629lab.25 for ; Tue, 30 Jul 2013 13:46:57 -0700 (PDT) Message-Id: <20130730204654.966378702@gmail.com> Date: Wed, 31 Jul 2013 00:41:56 +0400 From: Cyrill Gorcunov Subject: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> Content-Disposition: inline; filename=pte-sft-dirty-file-2 Sender: owner-linux-mm@kvack.org List-ID: To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Andy reported that if a file page gets reclaimed we lose the soft-dirty bit if it was there, so save the _PAGE_BIT_SOFT_DIRTY bit when the page address gets encoded into the pte entry. Thus when a #pf happens on such a non-present pte we can restore it back. 
Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andrew Morton Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" --- arch/x86/include/asm/pgtable-2level.h | 48 +++++++++++++++++++++++++++++++++- arch/x86/include/asm/pgtable-3level.h | 3 ++ arch/x86/include/asm/pgtable.h | 15 ++++++++++ arch/x86/include/asm/pgtable_types.h | 4 ++ fs/proc/task_mmu.c | 2 + include/asm-generic/pgtable.h | 15 ++++++++++ mm/fremap.c | 11 +++++-- mm/memory.c | 11 +++++-- mm/rmap.c | 8 ++++- 9 files changed, 107 insertions(+), 10 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable-2level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-2level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. 
+ */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. 
*/ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_ << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) Index: linux-2.6.git/arch/x86/include/asm/pgtable-3level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-3level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_ /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ Index: linux-2.6.git/arch/x86/include/asm/pgtable.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h +++ linux-2.6.git/arch/x86/include/asm/pgtable.h @@ -329,6 +329,21 @@ static inline pte_t pte_swp_clear_soft_d return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h +++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h @@ -61,8 +61,10 @@ * they do not conflict with each other. */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif Index: linux-2.6.git/fs/proc/task_mmu.c =================================================================== --- linux-2.6.git.orig/fs/proc/task_mmu.c +++ linux-2.6.git/fs/proc/task_mmu.c @@ -736,6 +736,8 @@ static inline void clear_soft_dirty(stru ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); } set_pte_at(vma->vm_mm, addr, pte, ptent); Index: linux-2.6.git/include/asm-generic/pgtable.h =================================================================== --- linux-2.6.git.orig/include/asm-generic/pgtable.h +++ linux-2.6.git/include/asm-generic/pgtable.h @@ -432,6 +432,21 @@ static inline pte_t pte_swp_clear_soft_d { return pte; } + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return 0; +} #endif #ifndef __HAVE_PFNMAP_TRACKING Index: linux-2.6.git/mm/fremap.c =================================================================== --- linux-2.6.git.orig/mm/fremap.c +++ linux-2.6.git/mm/fremap.c @@ -57,17 +57,22 @@ static int install_file_pte(struct mm_st unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; - pte_t *pte; + pte_t *pte, ptfile; spinlock_t *ptl; pte = 
get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - if (!pte_none(*pte)) + ptfile = pgoff_to_pte(pgoff); + + if (!pte_none(*pte)) { + if (pte_present(*pte) && pte_soft_dirty(*pte)) + ptfile = pte_file_mksoft_dirty(ptfile); zap_pte(mm, vma, addr, pte); + } - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + set_pte_at(mm, addr, pte, ptfile); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a Index: linux-2.6.git/mm/memory.c =================================================================== --- linux-2.6.git.orig/mm/memory.c +++ linux-2.6.git/mm/memory.c @@ -1141,9 +1141,12 @@ again: continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + addr) != page->index) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(ptent)) + ptfile = pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, addr, pte, ptfile); + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -3410,6 +3413,8 @@ static int __do_fault(struct mm_struct * entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); Index: linux-2.6.git/mm/rmap.c =================================================================== --- linux-2.6.git.orig/mm/rmap.c +++ linux-2.6.git/mm/rmap.c @@ -1405,8 +1405,12 @@ static int try_to_unmap_cluster(unsigned pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. 
*/ - if (page->index != linear_page_index(vma, address)) - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + if (page->index != linear_page_index(vma, address)) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(pteval)) + ptfile = pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, address, pte, ptfile); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx172.postini.com [74.125.245.172]) by kanga.kvack.org (Postfix) with SMTP id 951766B0034 for ; Tue, 30 Jul 2013 16:46:58 -0400 (EDT) Received: by mail-lb0-f175.google.com with SMTP id 13so7403lba.6 for ; Tue, 30 Jul 2013 13:46:56 -0700 (PDT) Message-Id: <20130730204654.844299768@gmail.com> Date: Wed, 31 Jul 2013 00:41:55 +0400 From: Cyrill Gorcunov Subject: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages References: <20130730204154.407090410@gmail.com> Content-Disposition: inline; filename=pte-sft-dirty-swap-4 Sender: owner-linux-mm@kvack.org List-ID: To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY bit set get swapped out, the bit is getting lost and no longer available when pte read back. To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is saved in pte entry for the page being swapped out. 
When such page is to be read back from a swap cache we check for bit presence and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY bit back. One of the problem was to find a place in pte entry where we can save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The _PAGE_PSE was chosen for that, it doesn't intersect with swap entry format stored in pte. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andrew Morton Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" --- arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ fs/proc/task_mmu.c | 21 +++++++++++++++------ include/asm-generic/pgtable.h | 15 +++++++++++++++ include/linux/swapops.h | 2 ++ mm/memory.c | 2 ++ mm/rmap.c | 6 +++++- mm/swapfile.c | 19 +++++++++++++++++-- 8 files changed, 84 insertions(+), 9 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h +++ linux-2.6.git/arch/x86/include/asm/pgtable.h @@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h +++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h @@ -67,6 +67,19 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. + * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else Index: linux-2.6.git/fs/proc/task_mmu.c =================================================================== --- linux-2.6.git.orig/fs/proc/task_mmu.c +++ linux-2.6.git/fs/proc/task_mmu.c @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru * of how soft-dirty works. 
*/ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; Index: linux-2.6.git/include/asm-generic/pgtable.h =================================================================== --- linux-2.6.git.orig/include/asm-generic/pgtable.h +++ linux-2.6.git/include/asm-generic/pgtable.h @@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd { return pmd; } + +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte; +} #endif #ifndef __HAVE_PFNMAP_TRACKING Index: linux-2.6.git/include/linux/swapops.h =================================================================== --- linux-2.6.git.orig/include/linux/swapops.h +++ linux-2.6.git/include/linux/swapops.h @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent swp_entry_t 
arch_entry; BUG_ON(pte_file(pte)); + if (pte_swp_soft_dirty(pte)) + pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } Index: linux-2.6.git/mm/memory.c =================================================================== --- linux-2.6.git.orig/mm/memory.c +++ linux-2.6.git/mm/memory.c @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); Index: linux-2.6.git/mm/rmap.c =================================================================== --- linux-2.6.git.orig/mm/rmap.c +++ linux-2.6.git/mm/rmap.c @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { Index: linux-2.6.git/mm/swapfile.c =================================================================== --- linux-2.6.git.orig/mm/swapfile.c +++ linux-2.6.git/mm/swapfile.c @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has 
it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx163.postini.com [74.125.245.163]) by kanga.kvack.org (Postfix) with SMTP id DB62E6B0031 for ; Wed, 31 Jul 2013 04:16:37 -0400 (EDT) Message-ID: <51F8C7CC.6010703@parallels.com> Date: Wed, 31 Jul 2013 12:16:12 +0400 From: Pavel Emelyanov MIME-Version: 1.0 Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> In-Reply-To: <20130730204654.844299768@gmail.com> Content-Type: text/plain; charset="ISO-8859-1" Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov , akpm@linux-foundation.org Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On 07/31/2013 12:41 AM, Cyrill Gorcunov wrote: > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > bit set get swapped out, the bit is getting lost and no longer > available when pte read back. > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > saved in pte entry for the page being swapped out. When such page > is to be read back from a swap cache we check for bit presence > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > bit back. > > One of the problem was to find a place in pte entry where we can > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > _PAGE_PSE was chosen for that, it doesn't intersect with swap > entry format stored in pte. > > Reported-by: Andy Lutomirski > Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx162.postini.com [74.125.245.162]) by kanga.kvack.org (Postfix) with SMTP id 3E7196B0032 for ; Wed, 31 Jul 2013 04:17:05 -0400 (EDT) Message-ID: <51F8C7F4.9020504@parallels.com> Date: Wed, 31 Jul 2013 12:16:52 +0400 From: Pavel Emelyanov MIME-Version: 1.0 Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> In-Reply-To: <20130730204654.966378702@gmail.com> Content-Type: text/plain; charset="ISO-8859-1" Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov , akpm@linux-foundation.org Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On 07/31/2013 12:41 AM, Cyrill Gorcunov wrote: > Andy reported that if file page get reclaimed we loose soft-dirty bit > if it was there, so save _PAGE_BIT_SOFT_DIRTY bit when page address > get encoded into pte entry. Thus when #pf happens on such non-present > pte we can restore it back. > > Reported-by: Andy Lutomirski > Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx160.postini.com [74.125.245.160]) by kanga.kvack.org (Postfix) with SMTP id BFCD56B0031 for ; Wed, 31 Jul 2013 20:51:06 -0400 (EDT) Date: Thu, 1 Aug 2013 09:51:32 +0900 From: Minchan Kim Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801005132.GB19540@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130730204654.844299768@gmail.com> Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Hello, On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > bit set get swapped out, the bit is getting lost and no longer > available when pte read back. > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > saved in pte entry for the page being swapped out. When such page > is to be read back from a swap cache we check for bit presence > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > bit back. > > One of the problem was to find a place in pte entry where we can > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > _PAGE_PSE was chosen for that, it doesn't intersect with swap > entry format stored in pte. 
> > Reported-by: Andy Lutomirski > Signed-off-by: Cyrill Gorcunov > Cc: Pavel Emelyanov > Cc: Andrew Morton > Cc: Matt Mackall > Cc: Xiao Guangrong > Cc: Marcelo Tosatti > Cc: KOSAKI Motohiro > Cc: Stephen Rothwell > Cc: Peter Zijlstra > Cc: "Aneesh Kumar K.V" > --- > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > fs/proc/task_mmu.c | 21 +++++++++++++++------ > include/asm-generic/pgtable.h | 15 +++++++++++++++ > include/linux/swapops.h | 2 ++ > mm/memory.c | 2 ++ > mm/rmap.c | 6 +++++- > mm/swapfile.c | 19 +++++++++++++++++-- > 8 files changed, 84 insertions(+), 9 deletions(-) > > Index: linux-2.6.git/arch/x86/include/asm/pgtable.h > =================================================================== > --- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h > +++ linux-2.6.git/arch/x86/include/asm/pgtable.h > @@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > } > > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > +{ > + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); > +} > + > +static inline int pte_swp_soft_dirty(pte_t pte) > +{ > + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; > +} > + > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > +{ > + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); > +} > + > /* > * Mask out unsupported bits in a present pgprot. Non-present pgprots > * can use those bits for other purposes, so leave them be. > Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h > =================================================================== > --- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h > +++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h > @@ -67,6 +67,19 @@ > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > #endif > > +/* > + * Tracking soft dirty bit when a page goes to a swap is tricky. > + * We need a bit which can be stored in pte _and_ not conflict > + * with swap entry format. 
On x86 bits 6 and 7 are *not* involved > + * into swap entry computation, but bit 6 is used for nonlinear > + * file mapping, so we borrow bit 7 for soft dirty tracking. > + */ > +#ifdef CONFIG_MEM_SOFT_DIRTY > +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE > +#else > +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > +#endif > + > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > #else > Index: linux-2.6.git/fs/proc/task_mmu.c > =================================================================== > --- linux-2.6.git.orig/fs/proc/task_mmu.c > +++ linux-2.6.git/fs/proc/task_mmu.c > @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > * of how soft-dirty works. > */ > pte_t ptent = *pte; > - ptent = pte_wrprotect(ptent); > - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > + > + if (pte_present(ptent)) { > + ptent = pte_wrprotect(ptent); > + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > + } else if (is_swap_pte(ptent)) { > + ptent = pte_swp_clear_soft_dirty(ptent); > + } > + > set_pte_at(vma->vm_mm, addr, pte, ptent); > #endif > } > @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > for (; addr != end; pte++, addr += PAGE_SIZE) { > ptent = *pte; > - if (!pte_present(ptent)) > - continue; > > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > clear_soft_dirty(vma, addr, pte); > continue; > } > > + if (!pte_present(ptent)) > + continue; > + > page = vm_normal_page(vma, addr, ptent); > if (!page) > continue; > @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > flags = PM_PRESENT; > page = vm_normal_page(vma, addr, pte); > } else if (is_swap_pte(pte)) { > - swp_entry_t entry = pte_to_swp_entry(pte); > - > + swp_entry_t entry; > + if (pte_swp_soft_dirty(pte)) > + flags2 |= __PM_SOFT_DIRTY; > + entry = pte_to_swp_entry(pte); > frame = swp_type(entry) | > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > flags = PM_SWAP; > Index: 
linux-2.6.git/include/asm-generic/pgtable.h > =================================================================== > --- linux-2.6.git.orig/include/asm-generic/pgtable.h > +++ linux-2.6.git/include/asm-generic/pgtable.h > @@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > { > return pmd; > } > + > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > +{ > + return pte; > +} > + > +static inline int pte_swp_soft_dirty(pte_t pte) > +{ > + return 0; > +} > + > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > +{ > + return pte; > +} > #endif > > #ifndef __HAVE_PFNMAP_TRACKING > Index: linux-2.6.git/include/linux/swapops.h > =================================================================== > --- linux-2.6.git.orig/include/linux/swapops.h > +++ linux-2.6.git/include/linux/swapops.h > @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > swp_entry_t arch_entry; > > BUG_ON(pte_file(pte)); > + if (pte_swp_soft_dirty(pte)) > + pte = pte_swp_clear_soft_dirty(pte); Why do you remove soft-dirty flag whenever pte_to_swp_entry is called? Isn't there any problem if we use mincore? 
> arch_entry = __pte_to_swp_entry(pte); > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > } > Index: linux-2.6.git/mm/memory.c > =================================================================== > --- linux-2.6.git.orig/mm/memory.c > +++ linux-2.6.git/mm/memory.c > @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > exclusive = 1; > } > flush_icache_page(vma, page); > + if (pte_swp_soft_dirty(orig_pte)) > + pte = pte_mksoft_dirty(pte); > set_pte_at(mm, address, page_table, pte); > if (page == swapcache) > do_page_add_anon_rmap(page, vma, address, exclusive); > Index: linux-2.6.git/mm/rmap.c > =================================================================== > --- linux-2.6.git.orig/mm/rmap.c > +++ linux-2.6.git/mm/rmap.c > @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > swp_entry_to_pte(make_hwpoison_entry(page))); > } else if (PageAnon(page)) { > swp_entry_t entry = { .val = page_private(page) }; > + pte_t swp_pte; > > if (PageSwapCache(page)) { > /* > @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > entry = make_migration_entry(page, pte_write(pteval)); > } > - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); > + swp_pte = swp_entry_to_pte(entry); > + if (pte_soft_dirty(pteval)) > + swp_pte = pte_swp_mksoft_dirty(swp_pte); > + set_pte_at(mm, address, pte, swp_pte); > BUG_ON(pte_file(*pte)); > } else if (IS_ENABLED(CONFIG_MIGRATION) && > (TTU_ACTION(flags) == TTU_MIGRATION)) { > Index: linux-2.6.git/mm/swapfile.c > =================================================================== > --- linux-2.6.git.orig/mm/swapfile.c > +++ linux-2.6.git/mm/swapfile.c > @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > } > #endif /* CONFIG_HIBERNATION */ > > +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) Nitpick. 
If maybe_same_pte is used widely, it looks good to me but it's used for only swapoff at the moment so I think pte_swap_same would be better name. > +{ > +#ifdef CONFIG_MEM_SOFT_DIRTY > + /* > + * When pte keeps soft dirty bit the pte generated > + * from swap entry does not has it, still it's same > + * pte from logical point of view. > + */ > + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); > + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); > +#else > + return pte_same(pte, swp_pte); > +#endif > +} > + > /* > * No need to decide whether this PTE shares the swap entry with others, > * just let do_wp_page work it out if a write is requested later - to > @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > } > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { > + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > mem_cgroup_cancel_charge_swapin(memcg); > ret = 0; > goto out; > @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > * swapoff spends a _lot_ of time in this loop! > * Test inline before going to call unuse_pte. > */ > - if (unlikely(pte_same(*pte, swp_pte))) { > + if (unlikely(maybe_same_pte(*pte, swp_pte))) { > pte_unmap(pte); > ret = unuse_pte(vma, pmd, addr, entry, page); > if (ret) > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: email@kvack.org -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx204.postini.com [74.125.245.204]) by kanga.kvack.org (Postfix) with SMTP id 7E8A56B0031 for ; Thu, 1 Aug 2013 01:53:21 -0400 (EDT) Received: by mail-lb0-f169.google.com with SMTP id u10so1080192lbi.28 for ; Wed, 31 Jul 2013 22:53:19 -0700 (PDT) Date: Thu, 1 Aug 2013 09:53:03 +0400 From: Cyrill Gorcunov Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801055303.GA1764@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801005132.GB19540@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Thu, Aug 01, 2013 at 09:51:32AM +0900, Minchan Kim wrote: > > Index: linux-2.6.git/include/linux/swapops.h > > =================================================================== > > --- linux-2.6.git.orig/include/linux/swapops.h > > +++ linux-2.6.git/include/linux/swapops.h > > @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > > swp_entry_t arch_entry; > > > > BUG_ON(pte_file(pte)); > > + if (pte_swp_soft_dirty(pte)) > > + pte = pte_swp_clear_soft_dirty(pte); > > Why do you remove soft-dirty flag whenever pte_to_swp_entry is called? > Isn't there any problem if we use mincore? No, there is no problem. pte_to_swp_entry is called when we know that the pte we're decoding has swap format (except for the case in the swap code which figures out the number of bits allowed for the offset). 
Still, since this bit is set at a "higher" level than the __swp_type/__swp_offset helpers, it should be cleared before the pte value is passed "one level down" to those helper functions. > > +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > > Nitpick. > If maybe_same_pte is used widely, it looks good to me > but it's used for only swapoff at the moment so I think pte_swap_same > would be better name. I don't see much difference, but sure, let's rename it on top once the series is in the -mm tree, sounds good? Cyrill -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx174.postini.com [74.125.245.174]) by kanga.kvack.org (Postfix) with SMTP id F3C0F6B0036 for ; Thu, 1 Aug 2013 02:16:05 -0400 (EDT) Date: Thu, 1 Aug 2013 15:16:32 +0900 From: Minchan Kim Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801061632.GE19540@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801055303.GA1764@moon> Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Thu, Aug 01, 2013 at 09:53:03AM +0400, Cyrill Gorcunov wrote: > On Thu, Aug 01, 2013 at 09:51:32AM +0900, Minchan Kim wrote: > > > Index: linux-2.6.git/include/linux/swapops.h > > > =================================================================== > > > --- 
linux-2.6.git.orig/include/linux/swapops.h > > > +++ linux-2.6.git/include/linux/swapops.h > > > @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > > > swp_entry_t arch_entry; > > > > > > BUG_ON(pte_file(pte)); > > > + if (pte_swp_soft_dirty(pte)) > > > + pte = pte_swp_clear_soft_dirty(pte); > > > > Why do you remove soft-dirty flag whenever pte_to_swp_entry is called? > > Isn't there any problem if we use mincore? > > No, there is no problem. pte_to_swp_entry caller when we know that pte > we're decoding is having swap format (except the case in swap code which > figures out the number of bits allowed for offset). Still since this bit > is set on "higher" level than __swp_type/__swp_offset helpers it should > be cleaned before the value from pte comes to "one level down" helpers > function. I don't get it. Could you correct me with below example? Process A context try_to_unmap swp_pte = swp_entry_to_pte /* change generic swp into arch swap */ swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(, swp_pte); Process A context .. mincore_pte_range pte_to_swp_entry pte = pte_swp_clear_soft_dirty <=== 1) change arch swp with generic swp mincore_page Process B want to know dirty state of the page .. pagemap_read pte_to_pagemap_entry is_swap_pte if (pte_swap_soft_dirty(pte)) <=== but failed by 1) So, Process B can't get the dirty status from process A's the page. > > > > +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > > > > Nitpick. > > If maybe_same_pte is used widely, it looks good to me > > but it's used for only swapoff at the moment so I think pte_swap_same > > would be better name. > > I don't see much difference, but sure, lets rename it on top once series > in -mm tree, sounds good? > > Cyrill > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: email@kvack.org -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx146.postini.com [74.125.245.146]) by kanga.kvack.org (Postfix) with SMTP id E9DD36B0037 for ; Thu, 1 Aug 2013 02:28:18 -0400 (EDT) Received: by mail-la0-f53.google.com with SMTP id el20so1141511lab.12 for ; Wed, 31 Jul 2013 23:28:17 -0700 (PDT) Date: Thu, 1 Aug 2013 10:28:14 +0400 From: Cyrill Gorcunov Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801062814.GB1764@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> <20130801061632.GE19540@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801061632.GE19540@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Thu, Aug 01, 2013 at 03:16:32PM +0900, Minchan Kim wrote: > > I don't get it. Could you correct me with below example? > > Process A context > try_to_unmap > swp_pte = swp_entry_to_pte /* change generic swp into arch swap */ > swp_pte = pte_swp_mksoft_dirty(swp_pte); > set_pte_at(, swp_pte); > > Process A context > .. 
> mincore_pte_range pte_t pte = *ptep; <-- local copy of the pte value, in memory it remains the same with swap softdirty bit set > pte_to_swp_entry > pte = pte_swp_clear_soft_dirty <=== 1) > change arch swp with generic swp > mincore_page > > Process B want to know dirty state of the page > .. > pagemap_read > pte_to_pagemap_entry > is_swap_pte > if (pte_swap_soft_dirty(pte)) <=== but failed by 1) > > So, Process B can't get the dirty status from process A's the page. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx113.postini.com [74.125.245.113]) by kanga.kvack.org (Postfix) with SMTP id E3C136B0037 for ; Thu, 1 Aug 2013 02:36:40 -0400 (EDT) Date: Thu, 1 Aug 2013 15:37:06 +0900 From: Minchan Kim Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801063706.GF19540@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> <20130801061632.GE19540@bbox> <20130801062814.GB1764@moon> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801062814.GB1764@moon> Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Thu, Aug 01, 2013 at 10:28:14AM +0400, Cyrill Gorcunov wrote: > On Thu, Aug 01, 2013 at 03:16:32PM +0900, Minchan Kim wrote: > > > > I don't get it. Could you correct me with below example? 
> > > > Process A context > > try_to_unmap > > swp_pte = swp_entry_to_pte /* change generic swp into arch swap */ > > swp_pte = pte_swp_mksoft_dirty(swp_pte); > > set_pte_at(, swp_pte); > > > > Process A context > > .. > > mincore_pte_range > pte_t pte = *ptep; <-- local copy of the pte value, in memory it remains the same > with swap softdirty bit set Argh, I missed that. Thank you! Reviewed-by: Minchan Kim -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx103.postini.com [74.125.245.103]) by kanga.kvack.org (Postfix) with SMTP id EC96A6B0038 for ; Thu, 1 Aug 2013 02:38:17 -0400 (EDT) Received: by mail-la0-f52.google.com with SMTP id fq13so1119937lab.25 for ; Wed, 31 Jul 2013 23:38:15 -0700 (PDT) Date: Thu, 1 Aug 2013 10:38:13 +0400 From: Cyrill Gorcunov Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801063813.GC1764@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> <20130801061632.GE19540@bbox> <20130801062814.GB1764@moon> <20130801063706.GF19540@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801063706.GF19540@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Thu, Aug 01, 2013 at 03:37:06PM +0900, Minchan Kim wrote: > > Reviewed-by: Minchan Kim Thanks a lot for review, Minchan! 
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx198.postini.com [74.125.245.198]) by kanga.kvack.org (Postfix) with SMTP id 0FCEB6B0031 for ; Sun, 4 Aug 2013 21:48:40 -0400 (EDT) Received: from /spool/local by e28smtp08.in.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Mon, 5 Aug 2013 07:08:14 +0530 Received: from d28relay04.in.ibm.com (d28relay04.in.ibm.com [9.184.220.61]) by d28dlp01.in.ibm.com (Postfix) with ESMTP id 8E77EE0053 for ; Mon, 5 Aug 2013 07:18:45 +0530 (IST) Received: from d28av05.in.ibm.com (d28av05.in.ibm.com [9.184.220.67]) by d28relay04.in.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r751mS5l39714820 for ; Mon, 5 Aug 2013 07:18:29 +0530 Received: from d28av05.in.ibm.com (localhost [127.0.0.1]) by d28av05.in.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id r751mVjg023691 for ; Mon, 5 Aug 2013 07:18:31 +0530 Date: Mon, 5 Aug 2013 09:48:29 +0800 From: Wanpeng Li Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805014829.GA13702@hacker.(null)> Reply-To: Wanpeng Li References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130730204654.844299768@gmail.com> Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: 
>Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY >bit set get swapped out, the bit is getting lost and no longer >available when pte read back. > >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is >saved in pte entry for the page being swapped out. When such page >is to be read back from a swap cache we check for bit presence >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY >bit back. > >One of the problem was to find a place in pte entry where we can >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The >_PAGE_PSE was chosen for that, it doesn't intersect with swap >entry format stored in pte. > >Reported-by: Andy Lutomirski >Signed-off-by: Cyrill Gorcunov >Cc: Pavel Emelyanov >Cc: Andrew Morton >Cc: Matt Mackall >Cc: Xiao Guangrong >Cc: Marcelo Tosatti >Cc: KOSAKI Motohiro >Cc: Stephen Rothwell >Cc: Peter Zijlstra >Cc: "Aneesh Kumar K.V" >--- > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > fs/proc/task_mmu.c | 21 +++++++++++++++------ > include/asm-generic/pgtable.h | 15 +++++++++++++++ > include/linux/swapops.h | 2 ++ > mm/memory.c | 2 ++ > mm/rmap.c | 6 +++++- > mm/swapfile.c | 19 +++++++++++++++++-- > 8 files changed, 84 insertions(+), 9 deletions(-) > >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h >=================================================================== >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > } > >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >+{ >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); >+} >+ >+static inline int pte_swp_soft_dirty(pte_t pte) >+{ >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; >+} >+ >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >+{ >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); >+} >+ > 
/* > * Mask out unsupported bits in a present pgprot. Non-present pgprots > * can use those bits for other purposes, so leave them be. >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h >=================================================================== >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h >@@ -67,6 +67,19 @@ > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > #endif > >+/* >+ * Tracking soft dirty bit when a page goes to a swap is tricky. >+ * We need a bit which can be stored in pte _and_ not conflict >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved >+ * into swap entry computation, but bit 6 is used for nonlinear >+ * file mapping, so we borrow bit 7 for soft dirty tracking. >+ */ >+#ifdef CONFIG_MEM_SOFT_DIRTY >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE >+#else >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) >+#endif >+ > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > #else >Index: linux-2.6.git/fs/proc/task_mmu.c >=================================================================== >--- linux-2.6.git.orig/fs/proc/task_mmu.c >+++ linux-2.6.git/fs/proc/task_mmu.c >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > * of how soft-dirty works. 
> */ > pte_t ptent = *pte; >- ptent = pte_wrprotect(ptent); >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >+ >+ if (pte_present(ptent)) { >+ ptent = pte_wrprotect(ptent); >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >+ } else if (is_swap_pte(ptent)) { >+ ptent = pte_swp_clear_soft_dirty(ptent); >+ } >+ > set_pte_at(vma->vm_mm, addr, pte, ptent); > #endif > } >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > for (; addr != end; pte++, addr += PAGE_SIZE) { > ptent = *pte; >- if (!pte_present(ptent)) >- continue; > > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > clear_soft_dirty(vma, addr, pte); > continue; > } > >+ if (!pte_present(ptent)) >+ continue; >+ > page = vm_normal_page(vma, addr, ptent); > if (!page) > continue; >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > flags = PM_PRESENT; > page = vm_normal_page(vma, addr, pte); > } else if (is_swap_pte(pte)) { >- swp_entry_t entry = pte_to_swp_entry(pte); >- >+ swp_entry_t entry; >+ if (pte_swp_soft_dirty(pte)) >+ flags2 |= __PM_SOFT_DIRTY; >+ entry = pte_to_swp_entry(pte); > frame = swp_type(entry) | > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > flags = PM_SWAP; >Index: linux-2.6.git/include/asm-generic/pgtable.h >=================================================================== >--- linux-2.6.git.orig/include/asm-generic/pgtable.h >+++ linux-2.6.git/include/asm-generic/pgtable.h >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > { > return pmd; > } >+ >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >+{ >+ return pte; >+} >+ >+static inline int pte_swp_soft_dirty(pte_t pte) >+{ >+ return 0; >+} >+ >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >+{ >+ return pte; >+} > #endif > > #ifndef __HAVE_PFNMAP_TRACKING >Index: linux-2.6.git/include/linux/swapops.h >=================================================================== >--- linux-2.6.git.orig/include/linux/swapops.h >+++ 
linux-2.6.git/include/linux/swapops.h >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > swp_entry_t arch_entry; > > BUG_ON(pte_file(pte)); >+ if (pte_swp_soft_dirty(pte)) >+ pte = pte_swp_clear_soft_dirty(pte); > arch_entry = __pte_to_swp_entry(pte); > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > } >Index: linux-2.6.git/mm/memory.c >=================================================================== >--- linux-2.6.git.orig/mm/memory.c >+++ linux-2.6.git/mm/memory.c >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > exclusive = 1; > } > flush_icache_page(vma, page); >+ if (pte_swp_soft_dirty(orig_pte)) >+ pte = pte_mksoft_dirty(pte); entry = pte_to_swp_entry(orig_pte); orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. > set_pte_at(mm, address, page_table, pte); > if (page == swapcache) > do_page_add_anon_rmap(page, vma, address, exclusive); >Index: linux-2.6.git/mm/rmap.c >=================================================================== >--- linux-2.6.git.orig/mm/rmap.c >+++ linux-2.6.git/mm/rmap.c >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > swp_entry_to_pte(make_hwpoison_entry(page))); > } else if (PageAnon(page)) { > swp_entry_t entry = { .val = page_private(page) }; >+ pte_t swp_pte; > > if (PageSwapCache(page)) { > /* >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > entry = make_migration_entry(page, pte_write(pteval)); > } >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); >+ swp_pte = swp_entry_to_pte(entry); >+ if (pte_soft_dirty(pteval)) >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); >+ set_pte_at(mm, address, pte, swp_pte); > BUG_ON(pte_file(*pte)); > } else if (IS_ENABLED(CONFIG_MIGRATION) && > (TTU_ACTION(flags) == TTU_MIGRATION)) { >Index: linux-2.6.git/mm/swapfile.c >=================================================================== >--- linux-2.6.git.orig/mm/swapfile.c >+++ 
linux-2.6.git/mm/swapfile.c >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > } > #endif /* CONFIG_HIBERNATION */ > >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) >+{ >+#ifdef CONFIG_MEM_SOFT_DIRTY >+ /* >+ * When pte keeps soft dirty bit the pte generated >+ * from swap entry does not has it, still it's same >+ * pte from logical point of view. >+ */ >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >+#else >+ return pte_same(pte, swp_pte); >+#endif >+} >+ > /* > * No need to decide whether this PTE shares the swap entry with others, > * just let do_wp_page work it out if a write is requested later - to >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > } > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > mem_cgroup_cancel_charge_swapin(memcg); > ret = 0; > goto out; >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > * swapoff spends a _lot_ of time in this loop! > * Test inline before going to call unuse_pte. > */ >- if (unlikely(pte_same(*pte, swp_pte))) { >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { > pte_unmap(pte); > ret = unuse_pte(vma, pmd, addr, entry, page); > if (ret) > >-- >To unsubscribe, send a message with 'unsubscribe linux-mm' in >the body to majordomo@kvack.org. For more info on Linux MM, >see: http://www.linux-mm.org/ . >Don't email: email@kvack.org -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx180.postini.com [74.125.245.180]) by kanga.kvack.org (Postfix) with SMTP id DAAFF6B0031 for ; Sun, 4 Aug 2013 22:16:35 -0400 (EDT) Date: Mon, 5 Aug 2013 11:17:15 +0900 From: Minchan Kim Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805021715.GJ32486@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> Sender: owner-linux-mm@kvack.org List-ID: To: Wanpeng Li Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Hello Wanpeng, On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: > On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: > >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > >bit set get swapped out, the bit is getting lost and no longer > >available when pte read back. > > > >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > >saved in pte entry for the page being swapped out. When such page > >is to be read back from a swap cache we check for bit presence > >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > >bit back. > > > >One of the problem was to find a place in pte entry where we can > >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > >_PAGE_PSE was chosen for that, it doesn't intersect with swap > >entry format stored in pte. 
> > > >Reported-by: Andy Lutomirski > >Signed-off-by: Cyrill Gorcunov > >Cc: Pavel Emelyanov > >Cc: Andrew Morton > >Cc: Matt Mackall > >Cc: Xiao Guangrong > >Cc: Marcelo Tosatti > >Cc: KOSAKI Motohiro > >Cc: Stephen Rothwell > >Cc: Peter Zijlstra > >Cc: "Aneesh Kumar K.V" > >--- > > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > > fs/proc/task_mmu.c | 21 +++++++++++++++------ > > include/asm-generic/pgtable.h | 15 +++++++++++++++ > > include/linux/swapops.h | 2 ++ > > mm/memory.c | 2 ++ > > mm/rmap.c | 6 +++++- > > mm/swapfile.c | 19 +++++++++++++++++-- > > 8 files changed, 84 insertions(+), 9 deletions(-) > > > >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h > >=================================================================== > >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h > >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h > >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > > } > > > >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >+{ > >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >+} > >+ > >+static inline int pte_swp_soft_dirty(pte_t pte) > >+{ > >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; > >+} > >+ > >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >+{ > >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >+} > >+ > > /* > > * Mask out unsupported bits in a present pgprot. Non-present pgprots > > * can use those bits for other purposes, so leave them be. > >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >=================================================================== > >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h > >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >@@ -67,6 +67,19 @@ > > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > > #endif > > > >+/* > >+ * Tracking soft dirty bit when a page goes to a swap is tricky. 
> >+ * We need a bit which can be stored in pte _and_ not conflict > >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved > >+ * into swap entry computation, but bit 6 is used for nonlinear > >+ * file mapping, so we borrow bit 7 for soft dirty tracking. > >+ */ > >+#ifdef CONFIG_MEM_SOFT_DIRTY > >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE > >+#else > >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > >+#endif > >+ > > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > > #else > >Index: linux-2.6.git/fs/proc/task_mmu.c > >=================================================================== > >--- linux-2.6.git.orig/fs/proc/task_mmu.c > >+++ linux-2.6.git/fs/proc/task_mmu.c > >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > > * of how soft-dirty works. > > */ > > pte_t ptent = *pte; > >- ptent = pte_wrprotect(ptent); > >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >+ > >+ if (pte_present(ptent)) { > >+ ptent = pte_wrprotect(ptent); > >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >+ } else if (is_swap_pte(ptent)) { > >+ ptent = pte_swp_clear_soft_dirty(ptent); > >+ } > >+ > > set_pte_at(vma->vm_mm, addr, pte, ptent); > > #endif > > } > >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > > for (; addr != end; pte++, addr += PAGE_SIZE) { > > ptent = *pte; > >- if (!pte_present(ptent)) > >- continue; > > > > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > > clear_soft_dirty(vma, addr, pte); > > continue; > > } > > > >+ if (!pte_present(ptent)) > >+ continue; > >+ > > page = vm_normal_page(vma, addr, ptent); > > if (!page) > > continue; > >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > > flags = PM_PRESENT; > > page = vm_normal_page(vma, addr, pte); > > } else if (is_swap_pte(pte)) { > >- swp_entry_t entry = pte_to_swp_entry(pte); > >- > >+ swp_entry_t entry; > >+ if 
(pte_swp_soft_dirty(pte)) > >+ flags2 |= __PM_SOFT_DIRTY; > >+ entry = pte_to_swp_entry(pte); > > frame = swp_type(entry) | > > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > > flags = PM_SWAP; > >Index: linux-2.6.git/include/asm-generic/pgtable.h > >=================================================================== > >--- linux-2.6.git.orig/include/asm-generic/pgtable.h > >+++ linux-2.6.git/include/asm-generic/pgtable.h > >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > > { > > return pmd; > > } > >+ > >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >+{ > >+ return pte; > >+} > >+ > >+static inline int pte_swp_soft_dirty(pte_t pte) > >+{ > >+ return 0; > >+} > >+ > >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >+{ > >+ return pte; > >+} > > #endif > > > > #ifndef __HAVE_PFNMAP_TRACKING > >Index: linux-2.6.git/include/linux/swapops.h > >=================================================================== > >--- linux-2.6.git.orig/include/linux/swapops.h > >+++ linux-2.6.git/include/linux/swapops.h > >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > > swp_entry_t arch_entry; > > > > BUG_ON(pte_file(pte)); > >+ if (pte_swp_soft_dirty(pte)) > >+ pte = pte_swp_clear_soft_dirty(pte); > > arch_entry = __pte_to_swp_entry(pte); > > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > > } > >Index: linux-2.6.git/mm/memory.c > >=================================================================== > >--- linux-2.6.git.orig/mm/memory.c > >+++ linux-2.6.git/mm/memory.c > >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > > exclusive = 1; > > } > > flush_icache_page(vma, page); > >+ if (pte_swp_soft_dirty(orig_pte)) > >+ pte = pte_mksoft_dirty(pte); > > entry = pte_to_swp_entry(orig_pte); > orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. You seem to walk same way with me. Please look at my stupid questions in this thread. 
> > > set_pte_at(mm, address, page_table, pte); > > if (page == swapcache) > > do_page_add_anon_rmap(page, vma, address, exclusive); > >Index: linux-2.6.git/mm/rmap.c > >=================================================================== > >--- linux-2.6.git.orig/mm/rmap.c > >+++ linux-2.6.git/mm/rmap.c > >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > > swp_entry_to_pte(make_hwpoison_entry(page))); > > } else if (PageAnon(page)) { > > swp_entry_t entry = { .val = page_private(page) }; > >+ pte_t swp_pte; > > > > if (PageSwapCache(page)) { > > /* > >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > > entry = make_migration_entry(page, pte_write(pteval)); > > } > >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); > >+ swp_pte = swp_entry_to_pte(entry); > >+ if (pte_soft_dirty(pteval)) > >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); > >+ set_pte_at(mm, address, pte, swp_pte); > > BUG_ON(pte_file(*pte)); > > } else if (IS_ENABLED(CONFIG_MIGRATION) && > > (TTU_ACTION(flags) == TTU_MIGRATION)) { > >Index: linux-2.6.git/mm/swapfile.c > >=================================================================== > >--- linux-2.6.git.orig/mm/swapfile.c > >+++ linux-2.6.git/mm/swapfile.c > >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > > } > > #endif /* CONFIG_HIBERNATION */ > > > >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > >+{ > >+#ifdef CONFIG_MEM_SOFT_DIRTY > >+ /* > >+ * When pte keeps soft dirty bit the pte generated > >+ * from swap entry does not has it, still it's same > >+ * pte from logical point of view. 
> >+ */ > >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); > >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); > >+#else > >+ return pte_same(pte, swp_pte); > >+#endif > >+} > >+ > > /* > > * No need to decide whether this PTE shares the swap entry with others, > > * just let do_wp_page work it out if a write is requested later - to > >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > > } > > > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { > >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > > mem_cgroup_cancel_charge_swapin(memcg); > > ret = 0; > > goto out; > >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > > * swapoff spends a _lot_ of time in this loop! > > * Test inline before going to call unuse_pte. > > */ > >- if (unlikely(pte_same(*pte, swp_pte))) { > >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { > > pte_unmap(pte); > > ret = unuse_pte(vma, pmd, addr, entry, page); > > if (ret) > > > >-- > >To unsubscribe, send a message with 'unsubscribe linux-mm' in > >the body to majordomo@kvack.org. For more info on Linux MM, > >see: http://www.linux-mm.org/ . > >Don't email: email@kvack.org > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: email@kvack.org -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx168.postini.com [74.125.245.168]) by kanga.kvack.org (Postfix) with SMTP id 4C5EA6B0031 for ; Sun, 4 Aug 2013 22:39:11 -0400 (EDT) Received: from /spool/local by e23smtp03.au.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! 
Violators will be prosecuted for from ; Mon, 5 Aug 2013 12:28:34 +1000 Received: from d23relay04.au.ibm.com (d23relay04.au.ibm.com [9.190.234.120]) by d23dlp02.au.ibm.com (Postfix) with ESMTP id 743B22BB0055 for ; Mon, 5 Aug 2013 12:39:02 +1000 (EST) Received: from d23av01.au.ibm.com (d23av01.au.ibm.com [9.190.234.96]) by d23relay04.au.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r752NOSF54722630 for ; Mon, 5 Aug 2013 12:23:24 +1000 Received: from d23av01.au.ibm.com (localhost [127.0.0.1]) by d23av01.au.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id r752d0n2018203 for ; Mon, 5 Aug 2013 12:39:01 +1000 Date: Mon, 5 Aug 2013 10:38:58 +0800 From: Wanpeng Li Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805023858.GA1039@hacker.(null)> Reply-To: Wanpeng Li References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130805021715.GJ32486@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Hi Minchan, On Mon, Aug 05, 2013 at 11:17:15AM +0900, Minchan Kim wrote: >Hello Wanpeng, > >On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: >> On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: >> >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY >> >bit set get swapped out, the bit is getting lost and no longer >> >available when pte read back. 
>> > >> >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is >> >saved in pte entry for the page being swapped out. When such page >> >is to be read back from a swap cache we check for bit presence >> >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY >> >bit back. >> > >> >One of the problem was to find a place in pte entry where we can >> >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The >> >_PAGE_PSE was chosen for that, it doesn't intersect with swap >> >entry format stored in pte. >> > >> >Reported-by: Andy Lutomirski >> >Signed-off-by: Cyrill Gorcunov >> >Cc: Pavel Emelyanov >> >Cc: Andrew Morton >> >Cc: Matt Mackall >> >Cc: Xiao Guangrong >> >Cc: Marcelo Tosatti >> >Cc: KOSAKI Motohiro >> >Cc: Stephen Rothwell >> >Cc: Peter Zijlstra >> >Cc: "Aneesh Kumar K.V" >> >--- >> > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ >> > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ >> > fs/proc/task_mmu.c | 21 +++++++++++++++------ >> > include/asm-generic/pgtable.h | 15 +++++++++++++++ >> > include/linux/swapops.h | 2 ++ >> > mm/memory.c | 2 ++ >> > mm/rmap.c | 6 +++++- >> > mm/swapfile.c | 19 +++++++++++++++++-- >> > 8 files changed, 84 insertions(+), 9 deletions(-) >> > >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h >> >=================================================================== >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h >> >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); >> > } >> > >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >+{ >> >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >+} >> >+ >> >+static inline int pte_swp_soft_dirty(pte_t pte) >> >+{ >> >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; >> >+} >> >+ >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >+{ >> >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >+} >> 
>+ >> > /* >> > * Mask out unsupported bits in a present pgprot. Non-present pgprots >> > * can use those bits for other purposes, so leave them be. >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >=================================================================== >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >@@ -67,6 +67,19 @@ >> > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) >> > #endif >> > >> >+/* >> >+ * Tracking soft dirty bit when a page goes to a swap is tricky. >> >+ * We need a bit which can be stored in pte _and_ not conflict >> >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved >> >+ * into swap entry computation, but bit 6 is used for nonlinear >> >+ * file mapping, so we borrow bit 7 for soft dirty tracking. >> >+ */ >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE >> >+#else >> >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) >> >+#endif >> >+ >> > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) >> > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) >> > #else >> >Index: linux-2.6.git/fs/proc/task_mmu.c >> >=================================================================== >> >--- linux-2.6.git.orig/fs/proc/task_mmu.c >> >+++ linux-2.6.git/fs/proc/task_mmu.c >> >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru >> > * of how soft-dirty works. 
>> > */ >> > pte_t ptent = *pte; >> >- ptent = pte_wrprotect(ptent); >> >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >+ >> >+ if (pte_present(ptent)) { >> >+ ptent = pte_wrprotect(ptent); >> >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >+ } else if (is_swap_pte(ptent)) { >> >+ ptent = pte_swp_clear_soft_dirty(ptent); >> >+ } >> >+ >> > set_pte_at(vma->vm_mm, addr, pte, ptent); >> > #endif >> > } >> >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> > for (; addr != end; pte++, addr += PAGE_SIZE) { >> > ptent = *pte; >> >- if (!pte_present(ptent)) >> >- continue; >> > >> > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { >> > clear_soft_dirty(vma, addr, pte); >> > continue; >> > } >> > >> >+ if (!pte_present(ptent)) >> >+ continue; >> >+ >> > page = vm_normal_page(vma, addr, ptent); >> > if (!page) >> > continue; >> >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap >> > flags = PM_PRESENT; >> > page = vm_normal_page(vma, addr, pte); >> > } else if (is_swap_pte(pte)) { >> >- swp_entry_t entry = pte_to_swp_entry(pte); >> >- >> >+ swp_entry_t entry; >> >+ if (pte_swp_soft_dirty(pte)) >> >+ flags2 |= __PM_SOFT_DIRTY; >> >+ entry = pte_to_swp_entry(pte); >> > frame = swp_type(entry) | >> > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); >> > flags = PM_SWAP; >> >Index: linux-2.6.git/include/asm-generic/pgtable.h >> >=================================================================== >> >--- linux-2.6.git.orig/include/asm-generic/pgtable.h >> >+++ linux-2.6.git/include/asm-generic/pgtable.h >> >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> > { >> > return pmd; >> > } >> >+ >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >+{ >> >+ return pte; >> >+} >> >+ >> >+static inline int pte_swp_soft_dirty(pte_t pte) >> >+{ >> >+ return 0; >> >+} >> >+ >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >+{ >> >+ return pte; >> >+} >> > 
#endif >> > >> > #ifndef __HAVE_PFNMAP_TRACKING >> >Index: linux-2.6.git/include/linux/swapops.h >> >=================================================================== >> >--- linux-2.6.git.orig/include/linux/swapops.h >> >+++ linux-2.6.git/include/linux/swapops.h >> >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent >> > swp_entry_t arch_entry; >> > >> > BUG_ON(pte_file(pte)); >> >+ if (pte_swp_soft_dirty(pte)) >> >+ pte = pte_swp_clear_soft_dirty(pte); >> > arch_entry = __pte_to_swp_entry(pte); >> > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); >> > } >> >Index: linux-2.6.git/mm/memory.c >> >=================================================================== >> >--- linux-2.6.git.orig/mm/memory.c >> >+++ linux-2.6.git/mm/memory.c >> >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct >> > exclusive = 1; >> > } >> > flush_icache_page(vma, page); >> >+ if (pte_swp_soft_dirty(orig_pte)) >> >+ pte = pte_mksoft_dirty(pte); >> >> entry = pte_to_swp_entry(orig_pte); >> orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. > >You seem to walk same way with me. >Please look at my stupid questions in this thread. > I see your discussion with Cyrill, however, pte_to_swp_entry and pte_swp_soft_dirty both against orig_pte, where I miss? 
;-) >> >> > set_pte_at(mm, address, page_table, pte); >> > if (page == swapcache) >> > do_page_add_anon_rmap(page, vma, address, exclusive); >> >Index: linux-2.6.git/mm/rmap.c >> >=================================================================== >> >--- linux-2.6.git.orig/mm/rmap.c >> >+++ linux-2.6.git/mm/rmap.c >> >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, >> > swp_entry_to_pte(make_hwpoison_entry(page))); >> > } else if (PageAnon(page)) { >> > swp_entry_t entry = { .val = page_private(page) }; >> >+ pte_t swp_pte; >> > >> > if (PageSwapCache(page)) { >> > /* >> >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, >> > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); >> > entry = make_migration_entry(page, pte_write(pteval)); >> > } >> >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); >> >+ swp_pte = swp_entry_to_pte(entry); >> >+ if (pte_soft_dirty(pteval)) >> >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); >> >+ set_pte_at(mm, address, pte, swp_pte); >> > BUG_ON(pte_file(*pte)); >> > } else if (IS_ENABLED(CONFIG_MIGRATION) && >> > (TTU_ACTION(flags) == TTU_MIGRATION)) { >> >Index: linux-2.6.git/mm/swapfile.c >> >=================================================================== >> >--- linux-2.6.git.orig/mm/swapfile.c >> >+++ linux-2.6.git/mm/swapfile.c >> >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, >> > } >> > #endif /* CONFIG_HIBERNATION */ >> > >> >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) >> >+{ >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >+ /* >> >+ * When pte keeps soft dirty bit the pte generated >> >+ * from swap entry does not has it, still it's same >> >+ * pte from logical point of view. 
>> >+ */ >> >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >> >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >> >+#else >> >+ return pte_same(pte, swp_pte); >> >+#endif >> >+} >> >+ >> > /* >> > * No need to decide whether this PTE shares the swap entry with others, >> > * just let do_wp_page work it out if a write is requested later - to >> >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru >> > } >> > >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { >> >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { >> > mem_cgroup_cancel_charge_swapin(memcg); >> > ret = 0; >> > goto out; >> >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are >> > * swapoff spends a _lot_ of time in this loop! >> > * Test inline before going to call unuse_pte. >> > */ >> >- if (unlikely(pte_same(*pte, swp_pte))) { >> >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { >> > pte_unmap(pte); >> > ret = unuse_pte(vma, pmd, addr, entry, page); >> > if (ret) >> > >> >-- >> >To unsubscribe, send a message with 'unsubscribe linux-mm' in >> >the body to majordomo@kvack.org. For more info on Linux MM, >> >see: http://www.linux-mm.org/ . >> >Don't email: email@kvack.org >> >> -- >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> the body to majordomo@kvack.org. For more info on Linux MM, >> see: http://www.linux-mm.org/ . >> Don't email: email@kvack.org > >-- >Kind regards, >Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx145.postini.com [74.125.245.145]) by kanga.kvack.org (Postfix) with SMTP id AB2526B0033 for ; Sun, 4 Aug 2013 22:53:57 -0400 (EDT) Date: Mon, 5 Aug 2013 11:54:37 +0900 From: Minchan Kim Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805025437.GK32486@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> Sender: owner-linux-mm@kvack.org List-ID: To: Wanpeng Li Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Mon, Aug 05, 2013 at 10:38:58AM +0800, Wanpeng Li wrote: > Hi Minchan, > > On Mon, Aug 05, 2013 at 11:17:15AM +0900, Minchan Kim wrote: > >Hello Wanpeng, > > > >On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: > >> On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: > >> >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > >> >bit set get swapped out, the bit is getting lost and no longer > >> >available when pte read back. > >> > > >> >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > >> >saved in pte entry for the page being swapped out. 
When such page > >> >is to be read back from a swap cache we check for bit presence > >> >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > >> >bit back. > >> > > >> >One of the problem was to find a place in pte entry where we can > >> >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > >> >_PAGE_PSE was chosen for that, it doesn't intersect with swap > >> >entry format stored in pte. > >> > > >> >Reported-by: Andy Lutomirski > >> >Signed-off-by: Cyrill Gorcunov > >> >Cc: Pavel Emelyanov > >> >Cc: Andrew Morton > >> >Cc: Matt Mackall > >> >Cc: Xiao Guangrong > >> >Cc: Marcelo Tosatti > >> >Cc: KOSAKI Motohiro > >> >Cc: Stephen Rothwell > >> >Cc: Peter Zijlstra > >> >Cc: "Aneesh Kumar K.V" > >> >--- > >> > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > >> > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > >> > fs/proc/task_mmu.c | 21 +++++++++++++++------ > >> > include/asm-generic/pgtable.h | 15 +++++++++++++++ > >> > include/linux/swapops.h | 2 ++ > >> > mm/memory.c | 2 ++ > >> > mm/rmap.c | 6 +++++- > >> > mm/swapfile.c | 19 +++++++++++++++++-- > >> > 8 files changed, 84 insertions(+), 9 deletions(-) > >> > > >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h > >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h > >> >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > >> > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > >> > } > >> > > >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >> >+{ > >> >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >> >+} > >> >+ > >> >+static inline int pte_swp_soft_dirty(pte_t pte) > >> >+{ > >> >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; > >> >+} > >> >+ > >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >> >+{ > >> >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >> >+} > >> >+ > >> > /* > >> 
> * Mask out unsupported bits in a present pgprot. Non-present pgprots > >> > * can use those bits for other purposes, so leave them be. > >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h > >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >> >@@ -67,6 +67,19 @@ > >> > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > >> > #endif > >> > > >> >+/* > >> >+ * Tracking soft dirty bit when a page goes to a swap is tricky. > >> >+ * We need a bit which can be stored in pte _and_ not conflict > >> >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved > >> >+ * into swap entry computation, but bit 6 is used for nonlinear > >> >+ * file mapping, so we borrow bit 7 for soft dirty tracking. > >> >+ */ > >> >+#ifdef CONFIG_MEM_SOFT_DIRTY > >> >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE > >> >+#else > >> >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > >> >+#endif > >> >+ > >> > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > >> > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > >> > #else > >> >Index: linux-2.6.git/fs/proc/task_mmu.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/fs/proc/task_mmu.c > >> >+++ linux-2.6.git/fs/proc/task_mmu.c > >> >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > >> > * of how soft-dirty works. 
> >> > */ > >> > pte_t ptent = *pte; > >> >- ptent = pte_wrprotect(ptent); > >> >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >> >+ > >> >+ if (pte_present(ptent)) { > >> >+ ptent = pte_wrprotect(ptent); > >> >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >> >+ } else if (is_swap_pte(ptent)) { > >> >+ ptent = pte_swp_clear_soft_dirty(ptent); > >> >+ } > >> >+ > >> > set_pte_at(vma->vm_mm, addr, pte, ptent); > >> > #endif > >> > } > >> >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > >> > for (; addr != end; pte++, addr += PAGE_SIZE) { > >> > ptent = *pte; > >> >- if (!pte_present(ptent)) > >> >- continue; > >> > > >> > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > >> > clear_soft_dirty(vma, addr, pte); > >> > continue; > >> > } > >> > > >> >+ if (!pte_present(ptent)) > >> >+ continue; > >> >+ > >> > page = vm_normal_page(vma, addr, ptent); > >> > if (!page) > >> > continue; > >> >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > >> > flags = PM_PRESENT; > >> > page = vm_normal_page(vma, addr, pte); > >> > } else if (is_swap_pte(pte)) { > >> >- swp_entry_t entry = pte_to_swp_entry(pte); > >> >- > >> >+ swp_entry_t entry; > >> >+ if (pte_swp_soft_dirty(pte)) > >> >+ flags2 |= __PM_SOFT_DIRTY; > >> >+ entry = pte_to_swp_entry(pte); > >> > frame = swp_type(entry) | > >> > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > >> > flags = PM_SWAP; > >> >Index: linux-2.6.git/include/asm-generic/pgtable.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/include/asm-generic/pgtable.h > >> >+++ linux-2.6.git/include/asm-generic/pgtable.h > >> >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > >> > { > >> > return pmd; > >> > } > >> >+ > >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >> >+{ > >> >+ return pte; > >> >+} > >> >+ > >> >+static inline int pte_swp_soft_dirty(pte_t pte) > >> >+{ > >> >+ 
return 0; > >> >+} > >> >+ > >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >> >+{ > >> >+ return pte; > >> >+} > >> > #endif > >> > > >> > #ifndef __HAVE_PFNMAP_TRACKING > >> >Index: linux-2.6.git/include/linux/swapops.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/include/linux/swapops.h > >> >+++ linux-2.6.git/include/linux/swapops.h > >> >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > >> > swp_entry_t arch_entry; > >> > > >> > BUG_ON(pte_file(pte)); > >> >+ if (pte_swp_soft_dirty(pte)) > >> >+ pte = pte_swp_clear_soft_dirty(pte); > >> > arch_entry = __pte_to_swp_entry(pte); > >> > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > >> > } > >> >Index: linux-2.6.git/mm/memory.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/mm/memory.c > >> >+++ linux-2.6.git/mm/memory.c > >> >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > >> > exclusive = 1; > >> > } > >> > flush_icache_page(vma, page); > >> >+ if (pte_swp_soft_dirty(orig_pte)) > >> >+ pte = pte_mksoft_dirty(pte); > >> > >> entry = pte_to_swp_entry(orig_pte); > >> orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. > > > >You seem to walk same way with me. > >Please look at my stupid questions in this thread. > > > > I see your discussion with Cyrill, however, pte_to_swp_entry and pte_swp_soft_dirty > both against orig_pte, where I miss? ;-) pte_to_swp_entry is passed orig_pte by value, not by pointer, so although pte_to_swp_entry clears _PTE_SWP_SOFT_DIRTY, it does so only in its local copy. So orig_pte is never changed. 
> > >> > >> > set_pte_at(mm, address, page_table, pte); > >> > if (page == swapcache) > >> > do_page_add_anon_rmap(page, vma, address, exclusive); > >> >Index: linux-2.6.git/mm/rmap.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/mm/rmap.c > >> >+++ linux-2.6.git/mm/rmap.c > >> >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > >> > swp_entry_to_pte(make_hwpoison_entry(page))); > >> > } else if (PageAnon(page)) { > >> > swp_entry_t entry = { .val = page_private(page) }; > >> >+ pte_t swp_pte; > >> > > >> > if (PageSwapCache(page)) { > >> > /* > >> >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > >> > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > >> > entry = make_migration_entry(page, pte_write(pteval)); > >> > } > >> >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); > >> >+ swp_pte = swp_entry_to_pte(entry); > >> >+ if (pte_soft_dirty(pteval)) > >> >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); > >> >+ set_pte_at(mm, address, pte, swp_pte); > >> > BUG_ON(pte_file(*pte)); > >> > } else if (IS_ENABLED(CONFIG_MIGRATION) && > >> > (TTU_ACTION(flags) == TTU_MIGRATION)) { > >> >Index: linux-2.6.git/mm/swapfile.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/mm/swapfile.c > >> >+++ linux-2.6.git/mm/swapfile.c > >> >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > >> > } > >> > #endif /* CONFIG_HIBERNATION */ > >> > > >> >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > >> >+{ > >> >+#ifdef CONFIG_MEM_SOFT_DIRTY > >> >+ /* > >> >+ * When pte keeps soft dirty bit the pte generated > >> >+ * from swap entry does not has it, still it's same > >> >+ * pte from logical point of view. 
> >> >+ */ > >> >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); > >> >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); > >> >+#else > >> >+ return pte_same(pte, swp_pte); > >> >+#endif > >> >+} > >> >+ > >> > /* > >> > * No need to decide whether this PTE shares the swap entry with others, > >> > * just let do_wp_page work it out if a write is requested later - to > >> >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > >> > } > >> > > >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > >> >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { > >> >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > >> > mem_cgroup_cancel_charge_swapin(memcg); > >> > ret = 0; > >> > goto out; > >> >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > >> > * swapoff spends a _lot_ of time in this loop! > >> > * Test inline before going to call unuse_pte. > >> > */ > >> >- if (unlikely(pte_same(*pte, swp_pte))) { > >> >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { > >> > pte_unmap(pte); > >> > ret = unuse_pte(vma, pmd, addr, entry, page); > >> > if (ret) > >> > > >> >-- > >> >To unsubscribe, send a message with 'unsubscribe linux-mm' in > >> >the body to majordomo@kvack.org. For more info on Linux MM, > >> >see: http://www.linux-mm.org/ . > >> >Don't email: email@kvack.org > >> > >> -- > >> To unsubscribe, send a message with 'unsubscribe linux-mm' in > >> the body to majordomo@kvack.org. For more info on Linux MM, > >> see: http://www.linux-mm.org/ . > >> Don't email: email@kvack.org > > > >-- > >Kind regards, > >Minchan Kim > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: email@kvack.org -- Kind regards, Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx187.postini.com [74.125.245.187]) by kanga.kvack.org (Postfix) with SMTP id DCA866B0031 for ; Sun, 4 Aug 2013 22:58:47 -0400 (EDT) Received: from /spool/local by e23smtp09.au.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Mon, 5 Aug 2013 23:53:38 +1000 Received: from d23relay04.au.ibm.com (d23relay04.au.ibm.com [9.190.234.120]) by d23dlp02.au.ibm.com (Postfix) with ESMTP id BDCC62BB0054 for ; Mon, 5 Aug 2013 12:58:42 +1000 (EST) Received: from d23av03.au.ibm.com (d23av03.au.ibm.com [9.190.234.97]) by d23relay04.au.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r752h0BZ65011806 for ; Mon, 5 Aug 2013 12:43:04 +1000 Received: from d23av03.au.ibm.com (localhost [127.0.0.1]) by d23av03.au.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id r752wbqN023183 for ; Mon, 5 Aug 2013 12:58:38 +1000 Date: Mon, 5 Aug 2013 10:58:35 +0800 From: Wanpeng Li Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805025835.GA6722@hacker.(null)> Reply-To: Wanpeng Li References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> <20130805025437.GK32486@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130805025437.GK32486@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Mon, Aug 05, 2013 at 
11:54:37AM +0900, Minchan Kim wrote: >On Mon, Aug 05, 2013 at 10:38:58AM +0800, Wanpeng Li wrote: >> Hi Minchan, >> >> On Mon, Aug 05, 2013 at 11:17:15AM +0900, Minchan Kim wrote: >> >Hello Wanpeng, >> > >> >On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: >> >> On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: >> >> >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY >> >> >bit set get swapped out, the bit is getting lost and no longer >> >> >available when pte read back. >> >> > >> >> >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is >> >> >saved in pte entry for the page being swapped out. When such page >> >> >is to be read back from a swap cache we check for bit presence >> >> >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY >> >> >bit back. >> >> > >> >> >One of the problem was to find a place in pte entry where we can >> >> >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The >> >> >_PAGE_PSE was chosen for that, it doesn't intersect with swap >> >> >entry format stored in pte. 
>> >> > >> >> >Reported-by: Andy Lutomirski >> >> >Signed-off-by: Cyrill Gorcunov >> >> >Cc: Pavel Emelyanov >> >> >Cc: Andrew Morton >> >> >Cc: Matt Mackall >> >> >Cc: Xiao Guangrong >> >> >Cc: Marcelo Tosatti >> >> >Cc: KOSAKI Motohiro >> >> >Cc: Stephen Rothwell >> >> >Cc: Peter Zijlstra >> >> >Cc: "Aneesh Kumar K.V" >> >> >--- >> >> > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ >> >> > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ >> >> > fs/proc/task_mmu.c | 21 +++++++++++++++------ >> >> > include/asm-generic/pgtable.h | 15 +++++++++++++++ >> >> > include/linux/swapops.h | 2 ++ >> >> > mm/memory.c | 2 ++ >> >> > mm/rmap.c | 6 +++++- >> >> > mm/swapfile.c | 19 +++++++++++++++++-- >> >> > 8 files changed, 84 insertions(+), 9 deletions(-) >> >> > >> >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h >> >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h >> >> >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> >> > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); >> >> > } >> >> > >> >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >> >+} >> >> >+ >> >> >+static inline int pte_swp_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; >> >> >+} >> >> >+ >> >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >> >+} >> >> >+ >> >> > /* >> >> > * Mask out unsupported bits in a present pgprot. Non-present pgprots >> >> > * can use those bits for other purposes, so leave them be. 
>> >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h >> >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >> >@@ -67,6 +67,19 @@ >> >> > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) >> >> > #endif >> >> > >> >> >+/* >> >> >+ * Tracking soft dirty bit when a page goes to a swap is tricky. >> >> >+ * We need a bit which can be stored in pte _and_ not conflict >> >> >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved >> >> >+ * into swap entry computation, but bit 6 is used for nonlinear >> >> >+ * file mapping, so we borrow bit 7 for soft dirty tracking. >> >> >+ */ >> >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >> >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE >> >> >+#else >> >> >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) >> >> >+#endif >> >> >+ >> >> > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) >> >> > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) >> >> > #else >> >> >Index: linux-2.6.git/fs/proc/task_mmu.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/fs/proc/task_mmu.c >> >> >+++ linux-2.6.git/fs/proc/task_mmu.c >> >> >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru >> >> > * of how soft-dirty works. 
>> >> > */ >> >> > pte_t ptent = *pte; >> >> >- ptent = pte_wrprotect(ptent); >> >> >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >> >+ >> >> >+ if (pte_present(ptent)) { >> >> >+ ptent = pte_wrprotect(ptent); >> >> >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >> >+ } else if (is_swap_pte(ptent)) { >> >> >+ ptent = pte_swp_clear_soft_dirty(ptent); >> >> >+ } >> >> >+ >> >> > set_pte_at(vma->vm_mm, addr, pte, ptent); >> >> > #endif >> >> > } >> >> >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p >> >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> >> > for (; addr != end; pte++, addr += PAGE_SIZE) { >> >> > ptent = *pte; >> >> >- if (!pte_present(ptent)) >> >> >- continue; >> >> > >> >> > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { >> >> > clear_soft_dirty(vma, addr, pte); >> >> > continue; >> >> > } >> >> > >> >> >+ if (!pte_present(ptent)) >> >> >+ continue; >> >> >+ >> >> > page = vm_normal_page(vma, addr, ptent); >> >> > if (!page) >> >> > continue; >> >> >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap >> >> > flags = PM_PRESENT; >> >> > page = vm_normal_page(vma, addr, pte); >> >> > } else if (is_swap_pte(pte)) { >> >> >- swp_entry_t entry = pte_to_swp_entry(pte); >> >> >- >> >> >+ swp_entry_t entry; >> >> >+ if (pte_swp_soft_dirty(pte)) >> >> >+ flags2 |= __PM_SOFT_DIRTY; >> >> >+ entry = pte_to_swp_entry(pte); >> >> > frame = swp_type(entry) | >> >> > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); >> >> > flags = PM_SWAP; >> >> >Index: linux-2.6.git/include/asm-generic/pgtable.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/include/asm-generic/pgtable.h >> >> >+++ linux-2.6.git/include/asm-generic/pgtable.h >> >> >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> >> > { >> >> > return pmd; >> >> > } >> >> >+ >> >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte; >> >> >+} >> >> >+ >> >> 
>+static inline int pte_swp_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return 0; >> >> >+} >> >> >+ >> >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte; >> >> >+} >> >> > #endif >> >> > >> >> > #ifndef __HAVE_PFNMAP_TRACKING >> >> >Index: linux-2.6.git/include/linux/swapops.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/include/linux/swapops.h >> >> >+++ linux-2.6.git/include/linux/swapops.h >> >> >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent >> >> > swp_entry_t arch_entry; >> >> > >> >> > BUG_ON(pte_file(pte)); >> >> >+ if (pte_swp_soft_dirty(pte)) >> >> >+ pte = pte_swp_clear_soft_dirty(pte); >> >> > arch_entry = __pte_to_swp_entry(pte); >> >> > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); >> >> > } >> >> >Index: linux-2.6.git/mm/memory.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/mm/memory.c >> >> >+++ linux-2.6.git/mm/memory.c >> >> >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct >> >> > exclusive = 1; >> >> > } >> >> > flush_icache_page(vma, page); >> >> >+ if (pte_swp_soft_dirty(orig_pte)) >> >> >+ pte = pte_mksoft_dirty(pte); >> >> >> >> entry = pte_to_swp_entry(orig_pte); >> >> orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. >> > >> >You seem to walk same way with me. >> >Please look at my stupid questions in this thread. >> > >> >> I see your discussion with Cyrill, however, pte_to_swp_entry and pte_swp_soft_dirty >> both against orig_pte, where I miss? ;-) > >pte_to_swp_entry is passed orig_pte by vaule, not a pointer >so although pte_to_swp_entry clear out _PTE_SWP_SOFT_DIRTY, it does it in local-copy. >So orig_pte is never changed. Ouch! Thanks for pointing out. 
;-) Reviewed-by: Wanpeng Li > >> >> >> >> >> > set_pte_at(mm, address, page_table, pte); >> >> > if (page == swapcache) >> >> > do_page_add_anon_rmap(page, vma, address, exclusive); >> >> >Index: linux-2.6.git/mm/rmap.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/mm/rmap.c >> >> >+++ linux-2.6.git/mm/rmap.c >> >> >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, >> >> > swp_entry_to_pte(make_hwpoison_entry(page))); >> >> > } else if (PageAnon(page)) { >> >> > swp_entry_t entry = { .val = page_private(page) }; >> >> >+ pte_t swp_pte; >> >> > >> >> > if (PageSwapCache(page)) { >> >> > /* >> >> >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, >> >> > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); >> >> > entry = make_migration_entry(page, pte_write(pteval)); >> >> > } >> >> >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); >> >> >+ swp_pte = swp_entry_to_pte(entry); >> >> >+ if (pte_soft_dirty(pteval)) >> >> >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); >> >> >+ set_pte_at(mm, address, pte, swp_pte); >> >> > BUG_ON(pte_file(*pte)); >> >> > } else if (IS_ENABLED(CONFIG_MIGRATION) && >> >> > (TTU_ACTION(flags) == TTU_MIGRATION)) { >> >> >Index: linux-2.6.git/mm/swapfile.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/mm/swapfile.c >> >> >+++ linux-2.6.git/mm/swapfile.c >> >> >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, >> >> > } >> >> > #endif /* CONFIG_HIBERNATION */ >> >> > >> >> >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) >> >> >+{ >> >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >> >+ /* >> >> >+ * When pte keeps soft dirty bit the pte generated >> >> >+ * from swap entry does not has it, still it's same >> >> >+ * pte from logical point of view. 
>> >> >+ */ >> >> >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >> >> >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >> >> >+#else >> >> >+ return pte_same(pte, swp_pte); >> >> >+#endif >> >> >+} >> >> >+ >> >> > /* >> >> > * No need to decide whether this PTE shares the swap entry with others, >> >> > * just let do_wp_page work it out if a write is requested later - to >> >> >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru >> >> > } >> >> > >> >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> >> >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { >> >> >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { >> >> > mem_cgroup_cancel_charge_swapin(memcg); >> >> > ret = 0; >> >> > goto out; >> >> >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are >> >> > * swapoff spends a _lot_ of time in this loop! >> >> > * Test inline before going to call unuse_pte. >> >> > */ >> >> >- if (unlikely(pte_same(*pte, swp_pte))) { >> >> >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { >> >> > pte_unmap(pte); >> >> > ret = unuse_pte(vma, pmd, addr, entry, page); >> >> > if (ret) >> >> > >> >> >-- >> >> >To unsubscribe, send a message with 'unsubscribe linux-mm' in >> >> >the body to majordomo@kvack.org. For more info on Linux MM, >> >> >see: http://www.linux-mm.org/ . >> >> >Don't email: email@kvack.org >> >> >> >> -- >> >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> >> the body to majordomo@kvack.org. For more info on Linux MM, >> >> see: http://www.linux-mm.org/ . >> >> Don't email: email@kvack.org >> > >> >-- >> >Kind regards, >> >Minchan Kim >> >> -- >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> the body to majordomo@kvack.org. For more info on Linux MM, >> see: http://www.linux-mm.org/ . >> Don't email: email@kvack.org > >-- >Kind regards, >Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx112.postini.com [74.125.245.112]) by kanga.kvack.org (Postfix) with SMTP id 7B3B76B0031 for ; Mon, 5 Aug 2013 01:43:39 -0400 (EDT) Received: by mail-la0-f50.google.com with SMTP id fn20so1726588lab.9 for ; Sun, 04 Aug 2013 22:43:37 -0700 (PDT) Date: Mon, 5 Aug 2013 09:43:35 +0400 From: Cyrill Gorcunov Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805054335.GC7999@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> <20130805025437.GK32486@bbox> <51ff14e9.87ef440a.1424.ffffe470SMTPIN_ADDED_BROKEN@mx.google.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51ff14e9.87ef440a.1424.ffffe470SMTPIN_ADDED_BROKEN@mx.google.com> Sender: owner-linux-mm@kvack.org List-ID: To: Wanpeng Li Cc: Minchan Kim , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Mon, Aug 05, 2013 at 10:58:35AM +0800, Wanpeng Li wrote: > > > >pte_to_swp_entry is passed orig_pte by value, not a pointer > >so although pte_to_swp_entry clears out _PTE_SWP_SOFT_DIRTY, it does it in local-copy. > >So orig_pte is never changed. > > Ouch! Thanks for pointing out. ;-) > > Reviewed-by: Wanpeng Li Yeah, it's a bit tricky. Thanks. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ .
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx194.postini.com [74.125.245.194]) by kanga.kvack.org (Postfix) with SMTP id 2C0AC6B0033 for ; Wed, 7 Aug 2013 16:21:58 -0400 (EDT) Date: Wed, 7 Aug 2013 13:21:56 -0700 From: Andrew Morton Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-Id: <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> In-Reply-To: <20130730204654.844299768@gmail.com> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, 31 Jul 2013 00:41:55 +0400 Cyrill Gorcunov wrote: > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > bit set get swapped out, the bit is getting lost and no longer > available when pte read back. > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > saved in pte entry for the page being swapped out. When such page > is to be read back from a swap cache we check for bit presence > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > bit back. > > One of the problem was to find a place in pte entry where we can > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > _PAGE_PSE was chosen for that, it doesn't intersect with swap > entry format stored in pte. So the implication is that if another architecture wants to support this (and, realistically, wants to support CRIU), that architecture must find a spare pte bit to implement _PTE_SWP_SOFT_DIRTY. Yes? 
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx166.postini.com [74.125.245.166]) by kanga.kvack.org (Postfix) with SMTP id 8746A6B0033 for ; Wed, 7 Aug 2013 16:28:14 -0400 (EDT) Date: Wed, 7 Aug 2013 13:28:12 -0700 From: Andrew Morton Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-Id: <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> In-Reply-To: <20130730204654.966378702@gmail.com> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, 31 Jul 2013 00:41:56 +0400 Cyrill Gorcunov wrote: > +#define pte_to_pgoff(pte) \ > + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ > + & ((1U << PTE_FILE_BITS1) - 1))) \ > + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ > + & ((1U << PTE_FILE_BITS2) - 1)) \ > + << (PTE_FILE_BITS1)) \ > + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ > + & ((1U << PTE_FILE_BITS3) - 1)) \ > + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ > + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ > + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) > + > +#define pgoff_to_pte(off) \ > + ((pte_t) { .pte_low = \ > + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ > + + ((((off) >> PTE_FILE_BITS1) \ > + & ((1U << PTE_FILE_BITS2) - 1)) \ > + << PTE_FILE_SHIFT2) \ > + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 
> + & ((1U << PTE_FILE_BITS3) - 1)) \ > + << PTE_FILE_SHIFT3) \ > + + ((((off) >> \ > + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ > + << PTE_FILE_SHIFT4) \ > + + _PAGE_FILE }) Good god. I wonder if these can be turned into out-of-line functions in some form which humans can understand. or #define pte_to_pgoff(pte) frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx172.postini.com [74.125.245.172]) by kanga.kvack.org (Postfix) with SMTP id D0AB86B0033 for ; Wed, 7 Aug 2013 16:29:17 -0400 (EDT) Received: by mail-lb0-f174.google.com with SMTP id w20so1879424lbh.19 for ; Wed, 07 Aug 2013 13:29:15 -0700 (PDT) Date: Thu, 8 Aug 2013 00:29:14 +0400 From: Cyrill Gorcunov Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130807202914.GO7999@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> Sender: owner-linux-mm@kvack.org List-ID: To: Andrew Morton Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, Aug 07, 2013 at 01:21:56PM -0700, Andrew Morton wrote: > > > > One of the problem was to find a place in pte entry where we 
can > > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > > _PAGE_PSE was chosen for that, it doesn't intersect with swap > > entry format stored in pte. > > So the implication is that if another architecture wants to support > this (and, realistically, wants to support CRIU), that architecture > must find a spare pte bit to implement _PTE_SWP_SOFT_DIRTY. Yes? Exactly. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx148.postini.com [74.125.245.148]) by kanga.kvack.org (Postfix) with SMTP id EF0D76B0032 for ; Wed, 7 Aug 2013 16:31:06 -0400 (EDT) Received: by mail-la0-f49.google.com with SMTP id ev20so1528687lab.8 for ; Wed, 07 Aug 2013 13:31:05 -0700 (PDT) Date: Thu, 8 Aug 2013 00:31:03 +0400 From: Cyrill Gorcunov Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130807203103.GP7999@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> Sender: owner-linux-mm@kvack.org List-ID: To: Andrew Morton Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: > > Good god. > > I wonder if these can be turned into out-of-line functions in some form > which humans can understand. 
> > or > > #define pte_to_pgoff(pte) > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) I copied this code from the existing one, it's not my invention ;) I'll clean it up on top. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx164.postini.com [74.125.245.164]) by kanga.kvack.org (Postfix) with SMTP id 648EC6B0031 for ; Thu, 8 Aug 2013 13:49:12 -0400 (EDT) Received: by mail-la0-f46.google.com with SMTP id eh20so2319439lab.5 for ; Thu, 08 Aug 2013 10:49:10 -0700 (PDT) Date: Thu, 8 Aug 2013 18:51:20 +0400 From: Cyrill Gorcunov Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130808145120.GA1775@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="r5Pyd7+fXNt84Ff3" Content-Disposition: inline In-Reply-To: <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> Sender: owner-linux-mm@kvack.org List-ID: To: Andrew Morton Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com --r5Pyd7+fXNt84Ff3 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: > > Good god. > > I wonder if these can be turned into out-of-line functions in some form > which humans can understand.
> > or > > #define pte_to_pgoff(pte) > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) Hi, here is what I ended up with. Please take a look (I decided to post patch in the thread since it's related to the context of the mails). --r5Pyd7+fXNt84Ff3 Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=pte-sft-dirty-file-cleanup-2 From: Cyrill Gorcunov Subject: mm: Cleanup pte_to_pgoff and pgoff_to_pte helpers Andrew asked if there is a way to make pte_to_pgoff and pgoff_to_pte macro helpers somehow more readable. With this patch it should be more understandable what is happening with bits when they come to and from pte entry. Signed-off-by: Cyrill Gorcunov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" --- Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" test present in this pgtable-2level.h file at all? I can't imagine where it can be false on x86.
arch/x86/include/asm/pgtable-2level.h | 82 +++++++++++++++++----------------- 1 file changed, 41 insertions(+), 41 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable-2level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-2level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-2level.h @@ -55,6 +55,9 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) +#define __frob(v,r,l) (((v) >> (r)) << (l)) + #ifdef CONFIG_MEM_SOFT_DIRTY /* @@ -71,31 +74,27 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ - & ((1U << PTE_FILE_BITS1) - 1))) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << (PTE_FILE_BITS1)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << PTE_FILE_SHIFT3) \ - + ((((off) >> \ - (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ - << PTE_FILE_SHIFT4) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) +#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) +#define 
PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) + +#define pte_to_pgoff(pte) \ + (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ + __frob((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_LSHIFT4)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + _mfrob(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ + __frob(off, PTE_FILE_LSHIFT4, PTE_FILE_SHIFT4) + \ + _PAGE_FILE }) #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -115,22 +114,23 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> PTE_FILE_SHIFT1) \ - & ((1U << PTE_FILE_BITS1) - 1)) \ - + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ - & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ - + (((pte).pte_low >> PTE_FILE_SHIFT3) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - << PTE_FILE_SHIFT3) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) + +#define pte_to_pgoff(pte) \ + (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __frob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_LSHIFT3)) + +#define pgoff_to_pte(off) \ + 
((pte_t) { .pte_low = \ + _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __frob(off, PTE_FILE_LSHIFT3, PTE_FILE_SHIFT3) + \ + _PAGE_FILE }) #endif /* CONFIG_MEM_SOFT_DIRTY */ --r5Pyd7+fXNt84Ff3-- -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx186.postini.com [74.125.245.186]) by kanga.kvack.org (Postfix) with SMTP id 586CB6B0031 for ; Sat, 10 Aug 2013 13:48:27 -0400 (EDT) Message-ID: <1376156903.2156.30.camel@dabdike.int.hansenpartnership.com> Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages From: James Bottomley Date: Sat, 10 Aug 2013 10:48:23 -0700 In-Reply-To: <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> Content-Type: text/plain; charset="ISO-8859-15" Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Andrew Morton Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, 2013-08-07 at 13:21 -0700, Andrew Morton wrote: > On Wed, 31 Jul 2013 00:41:55 +0400 Cyrill Gorcunov wrote: > > > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > > bit set get swapped out, the bit is getting lost and no longer > > available when pte read back. > > > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > > saved in pte entry for the page being swapped out. 
When such page > > is to be read back from a swap cache we check for bit presence > > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > > bit back. > > > > One of the problem was to find a place in pte entry where we can > > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > > _PAGE_PSE was chosen for that, it doesn't intersect with swap > > entry format stored in pte. > > So the implication is that if another architecture wants to support > this (and, realistically, wants to support CRIU), To be clear, CRIU is usable for basic checkpoint/restore without soft dirty. It's using CRIU as an engine for process migration between nodes that won't work efficiently without soft dirty. What happens without soft dirty is that we have to freeze the source process state, transfer the bits and then begin execution on the target ... that means the process can be suspended for minutes (and means that customers notice and your SLAs get blown). Using soft dirty, we can iteratively build up the process image on the target while the source process is still executing meaning the actual transfer between source and target takes only seconds (when the delta is small enough, we freeze the source, transfer the remaining changed bits and begin on the target). James -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx146.postini.com [74.125.245.146]) by kanga.kvack.org (Postfix) with SMTP id C12886B0062 for ; Mon, 12 Aug 2013 17:57:22 -0400 (EDT) Date: Mon, 12 Aug 2013 14:57:20 -0700 From: Andrew Morton Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-Id: <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> In-Reply-To: <20130808145120.GA1775@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. Peter Anvin" , Thomas Gleixner On Thu, 8 Aug 2013 18:51:20 +0400 Cyrill Gorcunov wrote: > On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: > > > > Good god. > > > > I wonder if these can be turned into out-of-line functions in some form > > which humans can understand. > > > > or > > > > #define pte_to_pgoff(pte) > > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + > > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + > > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + > > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) > > Hi, here is what I ended up with. Please take a look (I decided to post > patch in the thread since it's related to the context of the mails). You could have #undefed _mfrob and __frob after using them, but whatever. I saved this patch to wave at the x86 guys for 3.12. 
I plan to merge mm-save-soft-dirty-bits-on-file-pages.patch for 3.11. > Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" > test present in this pgtable-2level.h file at all? I can't imagine > where it can be false on x86. I doubt if "Guys" read this. x86 maintainers cc'ed. From: Cyrill Gorcunov Subject: arch/x86/include/asm/pgtable-2level.h: clean up pte_to_pgoff and pgoff_to_pte helpers Andrew asked if there a way to make pte_to_pgoff and pgoff_to_pte macro helpers somehow more readable. With this patch it should be more understandable what is happening with bits when they come to and from pte entry. Signed-off-by: Cyrill Gorcunov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable-2level.h | 82 ++++++++++++------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff -puN arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers arch/x86/include/asm/pgtable-2level.h --- a/arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers +++ a/arch/x86/include/asm/pgtable-2level.h @@ -55,6 +55,9 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) +#define __frob(v,r,l) (((v) >> (r)) << (l)) + #ifdef CONFIG_MEM_SOFT_DIRTY /* @@ -71,31 +74,27 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ - & ((1U << PTE_FILE_BITS1) - 1))) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << (PTE_FILE_BITS1)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << (PTE_FILE_BITS1 + 
PTE_FILE_BITS2)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << PTE_FILE_SHIFT3) \ - + ((((off) >> \ - (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ - << PTE_FILE_SHIFT4) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) +#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) +#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) + +#define pte_to_pgoff(pte) \ + (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ + __frob((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_LSHIFT4)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + _mfrob(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ + __frob(off, PTE_FILE_LSHIFT4, PTE_FILE_SHIFT4) + \ + _PAGE_FILE }) #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -115,22 +114,23 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> PTE_FILE_SHIFT1) \ - & ((1U << PTE_FILE_BITS1) - 1)) \ - + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ - & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ - + (((pte).pte_low >> 
PTE_FILE_SHIFT3) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - << PTE_FILE_SHIFT3) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) + +#define pte_to_pgoff(pte) \ + (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __frob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_LSHIFT3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __frob(off, PTE_FILE_LSHIFT3, PTE_FILE_SHIFT3) + \ + _PAGE_FILE }) #endif /* CONFIG_MEM_SOFT_DIRTY */ _ -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx175.postini.com [74.125.245.175]) by kanga.kvack.org (Postfix) with SMTP id 592EE6B005A for ; Mon, 12 Aug 2013 18:28:37 -0400 (EDT) Received: by mail-vc0-f182.google.com with SMTP id hf12so3303853vcb.27 for ; Mon, 12 Aug 2013 15:28:36 -0700 (PDT) MIME-Version: 1.0 In-Reply-To: <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> From: Andy Lutomirski Date: Mon, 12 Aug 2013 15:28:06 -0700 Message-ID: Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Content-Type: text/plain; charset=ISO-8859-1 Sender: owner-linux-mm@kvack.org List-ID: To: Andrew Morton Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. Peter Anvin" , Thomas Gleixner On Mon, Aug 12, 2013 at 2:57 PM, Andrew Morton wrote: > On Thu, 8 Aug 2013 18:51:20 +0400 Cyrill Gorcunov wrote: > >> On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: >> > >> > Good god. >> > >> > I wonder if these can be turned into out-of-line functions in some form >> > which humans can understand. >> > >> > or >> > >> > #define pte_to_pgoff(pte) >> > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + >> > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + >> > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + >> > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) >> >> Hi, here is what I ended up with. Please take a look (I decided to post >> patch in the thread since it's related to the context of the mails). 
> > You could have #undefed _mfrob and __frob after using them, but whatever. > > I saved this patch to wave at the x86 guys for 3.12. I plan to merge > mm-save-soft-dirty-bits-on-file-pages.patch for 3.11. > >> Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" >> test present in this pgtable-2level.h file at all? I can't imagine >> where it can be false on x86. > > I doubt if "Guys" read this. x86 maintainers cc'ed. > > > > > > From: Cyrill Gorcunov > Subject: arch/x86/include/asm/pgtable-2level.h: clean up pte_to_pgoff and pgoff_to_pte helpers > > Andrew asked if there a way to make pte_to_pgoff and pgoff_to_pte macro > helpers somehow more readable. > > With this patch it should be more understandable what is happening with > bits when they come to and from pte entry. > > Signed-off-by: Cyrill Gorcunov > Cc: Ingo Molnar > Cc: "H. Peter Anvin" > Cc: Thomas Gleixner > Signed-off-by: Andrew Morton > --- > > arch/x86/include/asm/pgtable-2level.h | 82 ++++++++++++------------ > 1 file changed, 41 insertions(+), 41 deletions(-) > > diff -puN arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers arch/x86/include/asm/pgtable-2level.h > --- a/arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers > +++ a/arch/x86/include/asm/pgtable-2level.h > @@ -55,6 +55,9 @@ static inline pmd_t native_pmdp_get_and_ > #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) > #endif > > +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) > +#define __frob(v,r,l) (((v) >> (r)) << (l)) > + > #ifdef CONFIG_MEM_SOFT_DIRTY > If I'm understanding this right, the idea is to take the bits in the range a..b of v and stick them at c..d, where a-b == c-d. 
Would it make sense to change this to look something like #define __frob(v, inmsb, inlsb, outlsb) (((v >> inlsb) & ((1 << (inmsb - inlsb + 1)) - 1)) << outlsb) For extra fun, there could be an __unfrob macro that takes the same inmsb, inlsb, outlsb parameters but undoes it so that it's (more) clear that the operations that are supposed to be inverses are indeed inverses. --Andy -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx173.postini.com [74.125.245.173]) by kanga.kvack.org (Postfix) with SMTP id 8806F6B0062 for ; Mon, 12 Aug 2013 18:37:27 -0400 (EDT) Date: Mon, 12 Aug 2013 15:37:25 -0700 From: Andrew Morton Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-Id: <20130812153725.6ac5135a86994e4d766723f9@linux-foundation.org> In-Reply-To: References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Andy Lutomirski Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. 
Peter Anvin" , Thomas Gleixner On Mon, 12 Aug 2013 15:28:06 -0700 Andy Lutomirski wrote: > > +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) > > +#define __frob(v,r,l) (((v) >> (r)) << (l)) > > + > > #ifdef CONFIG_MEM_SOFT_DIRTY > > > > If I'm understanding this right, the idea is to take the bits in the > range a..b of v and stick them at c..d, where a-b == c-d. Would it > make sense to change this to look something like > > #define __frob(v, inmsb, inlsb, outlsb) ((v >> inlsb) & ((1<<(inmsb - > inlsb + 1)-1) << outlsb) > > For extra fun, there could be an __unfrob macro that takes the same > inmsg, inlsb, outlsb parameters but undoes it so that it's (more) > clear that the operations that are supposed to be inverses are indeed > inverses. hm, I seem to remember writing drivers/net/ethernet/3com/3c59x.c:BFINS() and BFEXT() shortly after the invention of the electronic computer. I'm kinda surprised that we don't already have something like this in kernel.h or somewhere - there's surely a ton of code which does such things. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx118.postini.com [74.125.245.118]) by kanga.kvack.org (Postfix) with SMTP id 524706B0032 for ; Tue, 13 Aug 2013 01:02:17 -0400 (EDT) Received: by mail-la0-f52.google.com with SMTP id fq13so5365895lab.25 for ; Mon, 12 Aug 2013 22:02:15 -0700 (PDT) Date: Tue, 13 Aug 2013 09:02:13 +0400 From: Cyrill Gorcunov Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130813050213.GA2869@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: Sender: owner-linux-mm@kvack.org List-ID: To: Andy Lutomirski Cc: Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. Peter Anvin" , Thomas Gleixner On Mon, Aug 12, 2013 at 03:28:06PM -0700, Andy Lutomirski wrote: > > > > You could have #undefed _mfrob and __frob after using them, but whatever. Sure, for some reason I forgot to do that. Will send update on top. > > I saved this patch to wave at the x86 guys for 3.12. I plan to merge > > mm-save-soft-dirty-bits-on-file-pages.patch for 3.11. > > > >> Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" > >> test present in this pgtable-2level.h file at all? I can't imagine > >> where it can be false on x86. > > > > I doubt if "Guys" read this. x86 maintainers cc'ed. Thanks! 
> > +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) > > +#define __frob(v,r,l) (((v) >> (r)) << (l)) > > + > > #ifdef CONFIG_MEM_SOFT_DIRTY > > If I'm understanding this right, the idea is to take the bits in the > range a..b of v and stick them at c..d, where a-b == c-d. Would it > make sense to change this to look something like > > #define __frob(v, inmsb, inlsb, outlsb) ((v >> inlsb) & ((1<<(inmsb - > inlsb + 1)-1) << outlsb) There is a case when you don't need a mask completely. And because this pte conversion is on hot path and time critical I kept generated code as it was (even if that lead to slightly less clear source code). > For extra fun, there could be an __unfrob macro that takes the same > inmsg, inlsb, outlsb parameters but undoes it so that it's (more) > clear that the operations that are supposed to be inverses are indeed > inverses. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx156.postini.com [74.125.245.156]) by kanga.kvack.org (Postfix) with SMTP id 033196B0033 for ; Tue, 13 Aug 2013 11:15:36 -0400 (EDT) Message-ID: <520A4D5F.6020401@zytor.com> Date: Tue, 13 Aug 2013 08:14:39 -0700 From: "H. 
Peter Anvin" MIME-Version: 1.0 Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> In-Reply-To: <20130813050213.GA2869@moon> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner On 08/12/2013 10:02 PM, Cyrill Gorcunov wrote: > > There is a case when you don't need a mask completely. And because this > pte conversion is on hot path and time critical I kept generated code > as it was (even if that lead to slightly less clear source code). > Does it actually matter, generated-code-wise, or is the compiler smart enough to figure it out? The reason I'm asking is because it makes the code much harder to follow. The other thing is can we please pretty please call it something other than "frob"? -hpa -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx123.postini.com [74.125.245.123]) by kanga.kvack.org (Postfix) with SMTP id B375F6B0033 for ; Tue, 13 Aug 2013 11:37:06 -0400 (EDT) Received: by mail-la0-f45.google.com with SMTP id fj20so5955124lab.32 for ; Tue, 13 Aug 2013 08:37:04 -0700 (PDT) Date: Tue, 13 Aug 2013 19:37:03 +0400 From: Cyrill Gorcunov Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130813153703.GE2869@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> <520A4D5F.6020401@zytor.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <520A4D5F.6020401@zytor.com> Sender: owner-linux-mm@kvack.org List-ID: To: "H. Peter Anvin" Cc: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner On Tue, Aug 13, 2013 at 08:14:39AM -0700, H. Peter Anvin wrote: > On 08/12/2013 10:02 PM, Cyrill Gorcunov wrote: > > > > There is a case when you don't need a mask completely. And because this > > pte conversion is on hot path and time critical I kept generated code > > as it was (even if that lead to slightly less clear source code). > > > > Does it actually matter, generated-code-wise, or is the compiler smart > enough to figure it out? The reason I'm asking is because it makes the gcc-4.7.2 is smart enough to suppress useless masking (ie ((1u << 31) - 1)) completely but I don't know if this can be assumed for all gcc series. 
> code much harder to follow. I see. OK, I'll try to prepare more readable macro helpers. > > The other thing is can we please pretty please call it something other > than "frob"? Sure. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx119.postini.com [74.125.245.119]) by kanga.kvack.org (Postfix) with SMTP id 20FC56B0032 for ; Tue, 13 Aug 2013 12:44:09 -0400 (EDT) Message-ID: <520A622B.7020900@zytor.com> Date: Tue, 13 Aug 2013 09:43:23 -0700 From: "H. Peter Anvin" MIME-Version: 1.0 Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> <520A4D5F.6020401@zytor.com> <20130813153703.GE2869@moon> In-Reply-To: <20130813153703.GE2869@moon> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner On 08/13/2013 08:37 AM, Cyrill Gorcunov wrote: >> >> Does it actually matter, generated-code-wise, or is the compiler smart >> enough to figure it out? The reason I'm asking is because it makes the > > gcc-4.7.2 is smart enough to suppress useless masking (ie ((1u << 31) - 1)) > completely but I don't know if this can be assumed for all gcc series. 
> I would be highly surprised if it wasn't the case for any gcc we care about. -hpa -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from psmtp.com (na3sys010amx185.postini.com [74.125.245.185]) by kanga.kvack.org (Postfix) with SMTP id 067676B0034 for ; Tue, 13 Aug 2013 17:29:00 -0400 (EDT) Received: by mail-lb0-f179.google.com with SMTP id v1so6216801lbd.38 for ; Tue, 13 Aug 2013 14:28:59 -0700 (PDT) Date: Wed, 14 Aug 2013 01:28:57 +0400 From: Cyrill Gorcunov Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130813212857.GI2869@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> <520A4D5F.6020401@zytor.com> <20130813153703.GE2869@moon> <520A622B.7020900@zytor.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <520A622B.7020900@zytor.com> Sender: owner-linux-mm@kvack.org List-ID: To: "H. Peter Anvin" Cc: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner On Tue, Aug 13, 2013 at 09:43:23AM -0700, H. Peter Anvin wrote: > On 08/13/2013 08:37 AM, Cyrill Gorcunov wrote: > >> > >> Does it actually matter, generated-code-wise, or is the compiler smart > >> enough to figure it out? 
The reason I'm asking is because it makes the > > > > gcc-4.7.2 is smart enough to suppress useless masking (ie ((1u << 31) - 1)) > > completely but I don't know if this can be assumed for all gcc series. > > > > I would be highly surprised if it wasn't the case for any gcc we care about. Does the one below look better? (Btw, what about the snippet we have there as well #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) #else #define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) #define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) #endif where #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -> 8 #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY -> 6 so I wonder in which cases on x86 _PAGE_BIT_FILE > _PAGE_BIT_PROTNONE can hold; what am I missing here?) --- arch/x86/include/asm/pgtable-2level.h | 37 +++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable-2level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-2level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-2level.h @@ -55,8 +55,11 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) -#define __frob(v,r,l) (((v) >> (r)) << (l)) +/* + * For readable bitfield manipulations. 
+ */ +#define PTE_FILE_NOMASK (-1U) +#define __bfop(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) #ifdef CONFIG_MEM_SOFT_DIRTY @@ -83,17 +86,17 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) #define pte_to_pgoff(pte) \ - (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ - _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ - _mfrob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ - __frob((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_LSHIFT4)) + (__bfop((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_NOMASK, PTE_FILE_LSHIFT4)) #define pgoff_to_pte(off) \ ((pte_t) { .pte_low = \ - _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ - _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ - _mfrob(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ - __frob(off, PTE_FILE_LSHIFT4, PTE_FILE_SHIFT4) + \ + __bfop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + __bfop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __bfop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ + __bfop(off, PTE_FILE_LSHIFT4, PTE_FILE_NOMASK, PTE_FILE_SHIFT4) + \ _PAGE_FILE }) #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -121,19 +124,21 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) #define pte_to_pgoff(pte) \ - (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ - _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ - __frob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_LSHIFT3)) + (__bfop((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __bfop((pte).pte_low, 
PTE_FILE_SHIFT3, PTE_FILE_NOMASK, PTE_FILE_LSHIFT3)) #define pgoff_to_pte(off) \ ((pte_t) { .pte_low = \ - _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ - _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ - __frob(off, PTE_FILE_LSHIFT3, PTE_FILE_SHIFT3) + \ + __bfop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + __bfop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __bfop(off, PTE_FILE_LSHIFT3, PTE_FILE_NOMASK, PTE_FILE_SHIFT3) + \ _PAGE_FILE }) #endif /* CONFIG_MEM_SOFT_DIRTY */ +#undef __bfop + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Wanpeng Li Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Date: Mon, 5 Aug 2013 09:48:29 +0800 Message-ID: <28384.3475314372$1375667332@news.gmane.org> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> Reply-To: Wanpeng Li Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Received: from kanga.kvack.org ([205.233.56.17]) by plane.gmane.org with esmtp (Exim 4.69) (envelope-from ) id 1V69ud-0002TV-VP for glkm-linux-mm-2@m.gmane.org; Mon, 05 Aug 2013 03:48:44 +0200 Received: from psmtp.com (na3sys010amx198.postini.com [74.125.245.198]) by kanga.kvack.org (Postfix) with SMTP id 0FCEB6B0031 for ; Sun, 4 Aug 2013 21:48:40 -0400 (EDT) Received: from /spool/local by e28smtp08.in.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! 
Violators will be prosecuted for from ; Mon, 5 Aug 2013 07:08:14 +0530 Received: from d28relay04.in.ibm.com (d28relay04.in.ibm.com [9.184.220.61]) by d28dlp01.in.ibm.com (Postfix) with ESMTP id 8E77EE0053 for ; Mon, 5 Aug 2013 07:18:45 +0530 (IST) Received: from d28av05.in.ibm.com (d28av05.in.ibm.com [9.184.220.67]) by d28relay04.in.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r751mS5l39714820 for ; Mon, 5 Aug 2013 07:18:29 +0530 Received: from d28av05.in.ibm.com (localhost [127.0.0.1]) by d28av05.in.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id r751mVjg023691 for ; Mon, 5 Aug 2013 07:18:31 +0530 Content-Disposition: inline In-Reply-To: <20130730204654.844299768@gmail.com> Sender: owner-linux-mm@kvack.org List-ID: To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY >bit set get swapped out, the bit is getting lost and no longer >available when pte read back. > >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is >saved in pte entry for the page being swapped out. When such page >is to be read back from a swap cache we check for bit presence >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY >bit back. > >One of the problem was to find a place in pte entry where we can >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The >_PAGE_PSE was chosen for that, it doesn't intersect with swap >entry format stored in pte. 
> >Reported-by: Andy Lutomirski >Signed-off-by: Cyrill Gorcunov >Cc: Pavel Emelyanov >Cc: Andrew Morton >Cc: Matt Mackall >Cc: Xiao Guangrong >Cc: Marcelo Tosatti >Cc: KOSAKI Motohiro >Cc: Stephen Rothwell >Cc: Peter Zijlstra >Cc: "Aneesh Kumar K.V" >--- > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > fs/proc/task_mmu.c | 21 +++++++++++++++------ > include/asm-generic/pgtable.h | 15 +++++++++++++++ > include/linux/swapops.h | 2 ++ > mm/memory.c | 2 ++ > mm/rmap.c | 6 +++++- > mm/swapfile.c | 19 +++++++++++++++++-- > 8 files changed, 84 insertions(+), 9 deletions(-) > >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h >=================================================================== >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > } > >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >+{ >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); >+} >+ >+static inline int pte_swp_soft_dirty(pte_t pte) >+{ >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; >+} >+ >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >+{ >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); >+} >+ > /* > * Mask out unsupported bits in a present pgprot. Non-present pgprots > * can use those bits for other purposes, so leave them be. >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h >=================================================================== >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h >@@ -67,6 +67,19 @@ > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > #endif > >+/* >+ * Tracking soft dirty bit when a page goes to a swap is tricky. >+ * We need a bit which can be stored in pte _and_ not conflict >+ * with swap entry format. 
On x86 bits 6 and 7 are *not* involved >+ * into swap entry computation, but bit 6 is used for nonlinear >+ * file mapping, so we borrow bit 7 for soft dirty tracking. >+ */ >+#ifdef CONFIG_MEM_SOFT_DIRTY >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE >+#else >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) >+#endif >+ > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > #else >Index: linux-2.6.git/fs/proc/task_mmu.c >=================================================================== >--- linux-2.6.git.orig/fs/proc/task_mmu.c >+++ linux-2.6.git/fs/proc/task_mmu.c >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > * of how soft-dirty works. > */ > pte_t ptent = *pte; >- ptent = pte_wrprotect(ptent); >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >+ >+ if (pte_present(ptent)) { >+ ptent = pte_wrprotect(ptent); >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >+ } else if (is_swap_pte(ptent)) { >+ ptent = pte_swp_clear_soft_dirty(ptent); >+ } >+ > set_pte_at(vma->vm_mm, addr, pte, ptent); > #endif > } >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > for (; addr != end; pte++, addr += PAGE_SIZE) { > ptent = *pte; >- if (!pte_present(ptent)) >- continue; > > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > clear_soft_dirty(vma, addr, pte); > continue; > } > >+ if (!pte_present(ptent)) >+ continue; >+ > page = vm_normal_page(vma, addr, ptent); > if (!page) > continue; >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > flags = PM_PRESENT; > page = vm_normal_page(vma, addr, pte); > } else if (is_swap_pte(pte)) { >- swp_entry_t entry = pte_to_swp_entry(pte); >- >+ swp_entry_t entry; >+ if (pte_swp_soft_dirty(pte)) >+ flags2 |= __PM_SOFT_DIRTY; >+ entry = pte_to_swp_entry(pte); > frame = swp_type(entry) | > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > flags = PM_SWAP; >Index: 
linux-2.6.git/include/asm-generic/pgtable.h >=================================================================== >--- linux-2.6.git.orig/include/asm-generic/pgtable.h >+++ linux-2.6.git/include/asm-generic/pgtable.h >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > { > return pmd; > } >+ >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >+{ >+ return pte; >+} >+ >+static inline int pte_swp_soft_dirty(pte_t pte) >+{ >+ return 0; >+} >+ >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >+{ >+ return pte; >+} > #endif > > #ifndef __HAVE_PFNMAP_TRACKING >Index: linux-2.6.git/include/linux/swapops.h >=================================================================== >--- linux-2.6.git.orig/include/linux/swapops.h >+++ linux-2.6.git/include/linux/swapops.h >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > swp_entry_t arch_entry; > > BUG_ON(pte_file(pte)); >+ if (pte_swp_soft_dirty(pte)) >+ pte = pte_swp_clear_soft_dirty(pte); > arch_entry = __pte_to_swp_entry(pte); > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > } >Index: linux-2.6.git/mm/memory.c >=================================================================== >--- linux-2.6.git.orig/mm/memory.c >+++ linux-2.6.git/mm/memory.c >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > exclusive = 1; > } > flush_icache_page(vma, page); >+ if (pte_swp_soft_dirty(orig_pte)) >+ pte = pte_mksoft_dirty(pte); entry = pte_to_swp_entry(orig_pte); orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. 
> set_pte_at(mm, address, page_table, pte); > if (page == swapcache) > do_page_add_anon_rmap(page, vma, address, exclusive); >Index: linux-2.6.git/mm/rmap.c >=================================================================== >--- linux-2.6.git.orig/mm/rmap.c >+++ linux-2.6.git/mm/rmap.c >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > swp_entry_to_pte(make_hwpoison_entry(page))); > } else if (PageAnon(page)) { > swp_entry_t entry = { .val = page_private(page) }; >+ pte_t swp_pte; > > if (PageSwapCache(page)) { > /* >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > entry = make_migration_entry(page, pte_write(pteval)); > } >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); >+ swp_pte = swp_entry_to_pte(entry); >+ if (pte_soft_dirty(pteval)) >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); >+ set_pte_at(mm, address, pte, swp_pte); > BUG_ON(pte_file(*pte)); > } else if (IS_ENABLED(CONFIG_MIGRATION) && > (TTU_ACTION(flags) == TTU_MIGRATION)) { >Index: linux-2.6.git/mm/swapfile.c >=================================================================== >--- linux-2.6.git.orig/mm/swapfile.c >+++ linux-2.6.git/mm/swapfile.c >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > } > #endif /* CONFIG_HIBERNATION */ > >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) >+{ >+#ifdef CONFIG_MEM_SOFT_DIRTY >+ /* >+ * When pte keeps soft dirty bit the pte generated >+ * from swap entry does not has it, still it's same >+ * pte from logical point of view. 
>+ */ >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >+#else >+ return pte_same(pte, swp_pte); >+#endif >+} >+ > /* > * No need to decide whether this PTE shares the swap entry with others, > * just let do_wp_page work it out if a write is requested later - to >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > } > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > mem_cgroup_cancel_charge_swapin(memcg); > ret = 0; > goto out; >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > * swapoff spends a _lot_ of time in this loop! > * Test inline before going to call unuse_pte. > */ >- if (unlikely(pte_same(*pte, swp_pte))) { >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { > pte_unmap(pte); > ret = unuse_pte(vma, pmd, addr, entry, page); > if (ret) > >-- >To unsubscribe, send a message with 'unsubscribe linux-mm' in >the body to majordomo@kvack.org. For more info on Linux MM, >see: http://www.linux-mm.org/ . >Don't email: email@kvack.org -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Wanpeng Li Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Date: Mon, 5 Aug 2013 10:38:58 +0800 Message-ID: <19266.5442395539$1375670363@news.gmane.org> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> Reply-To: Wanpeng Li Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Received: from kanga.kvack.org ([205.233.56.17]) by plane.gmane.org with esmtp (Exim 4.69) (envelope-from ) id 1V6AhV-0003xF-U7 for glkm-linux-mm-2@m.gmane.org; Mon, 05 Aug 2013 04:39:14 +0200 Received: from psmtp.com (na3sys010amx168.postini.com [74.125.245.168]) by kanga.kvack.org (Postfix) with SMTP id 4C5EA6B0031 for ; Sun, 4 Aug 2013 22:39:11 -0400 (EDT) Received: from /spool/local by e23smtp03.au.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! Violators will be prosecuted for from ; Mon, 5 Aug 2013 12:28:34 +1000 Received: from d23relay04.au.ibm.com (d23relay04.au.ibm.com [9.190.234.120]) by d23dlp02.au.ibm.com (Postfix) with ESMTP id 743B22BB0055 for ; Mon, 5 Aug 2013 12:39:02 +1000 (EST) Received: from d23av01.au.ibm.com (d23av01.au.ibm.com [9.190.234.96]) by d23relay04.au.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r752NOSF54722630 for ; Mon, 5 Aug 2013 12:23:24 +1000 Received: from d23av01.au.ibm.com (localhost [127.0.0.1]) by d23av01.au.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id r752d0n2018203 for ; Mon, 5 Aug 2013 12:39:01 +1000 Content-Disposition: inline In-Reply-To: <20130805021715.GJ32486@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, 
sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Hi Minchan, On Mon, Aug 05, 2013 at 11:17:15AM +0900, Minchan Kim wrote: >Hello Wanpeng, > >On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: >> On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: >> >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY >> >bit set get swapped out, the bit is getting lost and no longer >> >available when pte read back. >> > >> >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is >> >saved in pte entry for the page being swapped out. When such page >> >is to be read back from a swap cache we check for bit presence >> >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY >> >bit back. >> > >> >One of the problem was to find a place in pte entry where we can >> >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The >> >_PAGE_PSE was chosen for that, it doesn't intersect with swap >> >entry format stored in pte. 
>> > >> >Reported-by: Andy Lutomirski >> >Signed-off-by: Cyrill Gorcunov >> >Cc: Pavel Emelyanov >> >Cc: Andrew Morton >> >Cc: Matt Mackall >> >Cc: Xiao Guangrong >> >Cc: Marcelo Tosatti >> >Cc: KOSAKI Motohiro >> >Cc: Stephen Rothwell >> >Cc: Peter Zijlstra >> >Cc: "Aneesh Kumar K.V" >> >--- >> > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ >> > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ >> > fs/proc/task_mmu.c | 21 +++++++++++++++------ >> > include/asm-generic/pgtable.h | 15 +++++++++++++++ >> > include/linux/swapops.h | 2 ++ >> > mm/memory.c | 2 ++ >> > mm/rmap.c | 6 +++++- >> > mm/swapfile.c | 19 +++++++++++++++++-- >> > 8 files changed, 84 insertions(+), 9 deletions(-) >> > >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h >> >=================================================================== >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h >> >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); >> > } >> > >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >+{ >> >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >+} >> >+ >> >+static inline int pte_swp_soft_dirty(pte_t pte) >> >+{ >> >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; >> >+} >> >+ >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >+{ >> >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >+} >> >+ >> > /* >> > * Mask out unsupported bits in a present pgprot. Non-present pgprots >> > * can use those bits for other purposes, so leave them be. 
>> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >=================================================================== >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >@@ -67,6 +67,19 @@ >> > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) >> > #endif >> > >> >+/* >> >+ * Tracking soft dirty bit when a page goes to a swap is tricky. >> >+ * We need a bit which can be stored in pte _and_ not conflict >> >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved >> >+ * into swap entry computation, but bit 6 is used for nonlinear >> >+ * file mapping, so we borrow bit 7 for soft dirty tracking. >> >+ */ >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE >> >+#else >> >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) >> >+#endif >> >+ >> > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) >> > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) >> > #else >> >Index: linux-2.6.git/fs/proc/task_mmu.c >> >=================================================================== >> >--- linux-2.6.git.orig/fs/proc/task_mmu.c >> >+++ linux-2.6.git/fs/proc/task_mmu.c >> >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru >> > * of how soft-dirty works. 
>> > */ >> > pte_t ptent = *pte; >> >- ptent = pte_wrprotect(ptent); >> >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >+ >> >+ if (pte_present(ptent)) { >> >+ ptent = pte_wrprotect(ptent); >> >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >+ } else if (is_swap_pte(ptent)) { >> >+ ptent = pte_swp_clear_soft_dirty(ptent); >> >+ } >> >+ >> > set_pte_at(vma->vm_mm, addr, pte, ptent); >> > #endif >> > } >> >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> > for (; addr != end; pte++, addr += PAGE_SIZE) { >> > ptent = *pte; >> >- if (!pte_present(ptent)) >> >- continue; >> > >> > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { >> > clear_soft_dirty(vma, addr, pte); >> > continue; >> > } >> > >> >+ if (!pte_present(ptent)) >> >+ continue; >> >+ >> > page = vm_normal_page(vma, addr, ptent); >> > if (!page) >> > continue; >> >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap >> > flags = PM_PRESENT; >> > page = vm_normal_page(vma, addr, pte); >> > } else if (is_swap_pte(pte)) { >> >- swp_entry_t entry = pte_to_swp_entry(pte); >> >- >> >+ swp_entry_t entry; >> >+ if (pte_swp_soft_dirty(pte)) >> >+ flags2 |= __PM_SOFT_DIRTY; >> >+ entry = pte_to_swp_entry(pte); >> > frame = swp_type(entry) | >> > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); >> > flags = PM_SWAP; >> >Index: linux-2.6.git/include/asm-generic/pgtable.h >> >=================================================================== >> >--- linux-2.6.git.orig/include/asm-generic/pgtable.h >> >+++ linux-2.6.git/include/asm-generic/pgtable.h >> >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> > { >> > return pmd; >> > } >> >+ >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >+{ >> >+ return pte; >> >+} >> >+ >> >+static inline int pte_swp_soft_dirty(pte_t pte) >> >+{ >> >+ return 0; >> >+} >> >+ >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >+{ >> >+ return pte; >> >+} >> > 
#endif >> > >> > #ifndef __HAVE_PFNMAP_TRACKING >> >Index: linux-2.6.git/include/linux/swapops.h >> >=================================================================== >> >--- linux-2.6.git.orig/include/linux/swapops.h >> >+++ linux-2.6.git/include/linux/swapops.h >> >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent >> > swp_entry_t arch_entry; >> > >> > BUG_ON(pte_file(pte)); >> >+ if (pte_swp_soft_dirty(pte)) >> >+ pte = pte_swp_clear_soft_dirty(pte); >> > arch_entry = __pte_to_swp_entry(pte); >> > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); >> > } >> >Index: linux-2.6.git/mm/memory.c >> >=================================================================== >> >--- linux-2.6.git.orig/mm/memory.c >> >+++ linux-2.6.git/mm/memory.c >> >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct >> > exclusive = 1; >> > } >> > flush_icache_page(vma, page); >> >+ if (pte_swp_soft_dirty(orig_pte)) >> >+ pte = pte_mksoft_dirty(pte); >> >> entry = pte_to_swp_entry(orig_pte); >> orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. > >You seem to walk same way with me. >Please look at my stupid questions in this thread. > I see your discussion with Cyrill, however, pte_to_swp_entry and pte_swp_soft_dirty both against orig_pte, where I miss? 
;-) >> >> > set_pte_at(mm, address, page_table, pte); >> > if (page == swapcache) >> > do_page_add_anon_rmap(page, vma, address, exclusive); >> >Index: linux-2.6.git/mm/rmap.c >> >=================================================================== >> >--- linux-2.6.git.orig/mm/rmap.c >> >+++ linux-2.6.git/mm/rmap.c >> >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, >> > swp_entry_to_pte(make_hwpoison_entry(page))); >> > } else if (PageAnon(page)) { >> > swp_entry_t entry = { .val = page_private(page) }; >> >+ pte_t swp_pte; >> > >> > if (PageSwapCache(page)) { >> > /* >> >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, >> > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); >> > entry = make_migration_entry(page, pte_write(pteval)); >> > } >> >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); >> >+ swp_pte = swp_entry_to_pte(entry); >> >+ if (pte_soft_dirty(pteval)) >> >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); >> >+ set_pte_at(mm, address, pte, swp_pte); >> > BUG_ON(pte_file(*pte)); >> > } else if (IS_ENABLED(CONFIG_MIGRATION) && >> > (TTU_ACTION(flags) == TTU_MIGRATION)) { >> >Index: linux-2.6.git/mm/swapfile.c >> >=================================================================== >> >--- linux-2.6.git.orig/mm/swapfile.c >> >+++ linux-2.6.git/mm/swapfile.c >> >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, >> > } >> > #endif /* CONFIG_HIBERNATION */ >> > >> >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) >> >+{ >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >+ /* >> >+ * When pte keeps soft dirty bit the pte generated >> >+ * from swap entry does not has it, still it's same >> >+ * pte from logical point of view. 
>> >+ */ >> >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >> >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >> >+#else >> >+ return pte_same(pte, swp_pte); >> >+#endif >> >+} >> >+ >> > /* >> > * No need to decide whether this PTE shares the swap entry with others, >> > * just let do_wp_page work it out if a write is requested later - to >> >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru >> > } >> > >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { >> >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { >> > mem_cgroup_cancel_charge_swapin(memcg); >> > ret = 0; >> > goto out; >> >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are >> > * swapoff spends a _lot_ of time in this loop! >> > * Test inline before going to call unuse_pte. >> > */ >> >- if (unlikely(pte_same(*pte, swp_pte))) { >> >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { >> > pte_unmap(pte); >> > ret = unuse_pte(vma, pmd, addr, entry, page); >> > if (ret) >> > >> >-- >> >To unsubscribe, send a message with 'unsubscribe linux-mm' in >> >the body to majordomo@kvack.org. For more info on Linux MM, >> >see: http://www.linux-mm.org/ . >> >Don't email: email@kvack.org >> >> -- >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> the body to majordomo@kvack.org. For more info on Linux MM, >> see: http://www.linux-mm.org/ . >> Don't email: email@kvack.org > >-- >Kind regards, >Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Wanpeng Li Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Date: Mon, 5 Aug 2013 10:58:35 +0800 Message-ID: <28784.617703832$1375671546@news.gmane.org> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> <20130805025437.GK32486@bbox> Reply-To: Wanpeng Li Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Received: from kanga.kvack.org ([205.233.56.17]) by plane.gmane.org with esmtp (Exim 4.69) (envelope-from ) id 1V6B0U-0001zY-FZ for glkm-linux-mm-2@m.gmane.org; Mon, 05 Aug 2013 04:58:51 +0200 Received: from psmtp.com (na3sys010amx187.postini.com [74.125.245.187]) by kanga.kvack.org (Postfix) with SMTP id DCA866B0031 for ; Sun, 4 Aug 2013 22:58:47 -0400 (EDT) Received: from /spool/local by e23smtp09.au.ibm.com with IBM ESMTP SMTP Gateway: Authorized Use Only! 
Violators will be prosecuted for from ; Mon, 5 Aug 2013 23:53:38 +1000 Received: from d23relay04.au.ibm.com (d23relay04.au.ibm.com [9.190.234.120]) by d23dlp02.au.ibm.com (Postfix) with ESMTP id BDCC62BB0054 for ; Mon, 5 Aug 2013 12:58:42 +1000 (EST) Received: from d23av03.au.ibm.com (d23av03.au.ibm.com [9.190.234.97]) by d23relay04.au.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id r752h0BZ65011806 for ; Mon, 5 Aug 2013 12:43:04 +1000 Received: from d23av03.au.ibm.com (localhost [127.0.0.1]) by d23av03.au.ibm.com (8.14.4/8.14.4/NCO v10.0 AVout) with ESMTP id r752wbqN023183 for ; Mon, 5 Aug 2013 12:58:38 +1000 Content-Disposition: inline In-Reply-To: <20130805025437.GK32486@bbox> Sender: owner-linux-mm@kvack.org List-ID: To: Minchan Kim Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com On Mon, Aug 05, 2013 at 11:54:37AM +0900, Minchan Kim wrote: >On Mon, Aug 05, 2013 at 10:38:58AM +0800, Wanpeng Li wrote: >> Hi Minchan, >> >> On Mon, Aug 05, 2013 at 11:17:15AM +0900, Minchan Kim wrote: >> >Hello Wanpeng, >> > >> >On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: >> >> On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: >> >> >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY >> >> >bit set get swapped out, the bit is getting lost and no longer >> >> >available when pte read back. >> >> > >> >> >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is >> >> >saved in pte entry for the page being swapped out. When such page >> >> >is to be read back from a swap cache we check for bit presence >> >> >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY >> >> >bit back. 
>> >> > >> >> >One of the problem was to find a place in pte entry where we can >> >> >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The >> >> >_PAGE_PSE was chosen for that, it doesn't intersect with swap >> >> >entry format stored in pte. >> >> > >> >> >Reported-by: Andy Lutomirski >> >> >Signed-off-by: Cyrill Gorcunov >> >> >Cc: Pavel Emelyanov >> >> >Cc: Andrew Morton >> >> >Cc: Matt Mackall >> >> >Cc: Xiao Guangrong >> >> >Cc: Marcelo Tosatti >> >> >Cc: KOSAKI Motohiro >> >> >Cc: Stephen Rothwell >> >> >Cc: Peter Zijlstra >> >> >Cc: "Aneesh Kumar K.V" >> >> >--- >> >> > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ >> >> > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ >> >> > fs/proc/task_mmu.c | 21 +++++++++++++++------ >> >> > include/asm-generic/pgtable.h | 15 +++++++++++++++ >> >> > include/linux/swapops.h | 2 ++ >> >> > mm/memory.c | 2 ++ >> >> > mm/rmap.c | 6 +++++- >> >> > mm/swapfile.c | 19 +++++++++++++++++-- >> >> > 8 files changed, 84 insertions(+), 9 deletions(-) >> >> > >> >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h >> >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h >> >> >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> >> > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); >> >> > } >> >> > >> >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >> >+} >> >> >+ >> >> >+static inline int pte_swp_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; >> >> >+} >> >> >+ >> >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); >> >> >+} >> >> >+ >> >> > /* >> >> > * Mask out unsupported bits in a present pgprot. 
Non-present pgprots >> >> > * can use those bits for other purposes, so leave them be. >> >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h >> >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h >> >> >@@ -67,6 +67,19 @@ >> >> > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) >> >> > #endif >> >> > >> >> >+/* >> >> >+ * Tracking soft dirty bit when a page goes to a swap is tricky. >> >> >+ * We need a bit which can be stored in pte _and_ not conflict >> >> >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved >> >> >+ * into swap entry computation, but bit 6 is used for nonlinear >> >> >+ * file mapping, so we borrow bit 7 for soft dirty tracking. >> >> >+ */ >> >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >> >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE >> >> >+#else >> >> >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) >> >> >+#endif >> >> >+ >> >> > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) >> >> > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) >> >> > #else >> >> >Index: linux-2.6.git/fs/proc/task_mmu.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/fs/proc/task_mmu.c >> >> >+++ linux-2.6.git/fs/proc/task_mmu.c >> >> >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru >> >> > * of how soft-dirty works. 
>> >> > */ >> >> > pte_t ptent = *pte; >> >> >- ptent = pte_wrprotect(ptent); >> >> >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >> >+ >> >> >+ if (pte_present(ptent)) { >> >> >+ ptent = pte_wrprotect(ptent); >> >> >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); >> >> >+ } else if (is_swap_pte(ptent)) { >> >> >+ ptent = pte_swp_clear_soft_dirty(ptent); >> >> >+ } >> >> >+ >> >> > set_pte_at(vma->vm_mm, addr, pte, ptent); >> >> > #endif >> >> > } >> >> >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p >> >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> >> > for (; addr != end; pte++, addr += PAGE_SIZE) { >> >> > ptent = *pte; >> >> >- if (!pte_present(ptent)) >> >> >- continue; >> >> > >> >> > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { >> >> > clear_soft_dirty(vma, addr, pte); >> >> > continue; >> >> > } >> >> > >> >> >+ if (!pte_present(ptent)) >> >> >+ continue; >> >> >+ >> >> > page = vm_normal_page(vma, addr, ptent); >> >> > if (!page) >> >> > continue; >> >> >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap >> >> > flags = PM_PRESENT; >> >> > page = vm_normal_page(vma, addr, pte); >> >> > } else if (is_swap_pte(pte)) { >> >> >- swp_entry_t entry = pte_to_swp_entry(pte); >> >> >- >> >> >+ swp_entry_t entry; >> >> >+ if (pte_swp_soft_dirty(pte)) >> >> >+ flags2 |= __PM_SOFT_DIRTY; >> >> >+ entry = pte_to_swp_entry(pte); >> >> > frame = swp_type(entry) | >> >> > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); >> >> > flags = PM_SWAP; >> >> >Index: linux-2.6.git/include/asm-generic/pgtable.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/include/asm-generic/pgtable.h >> >> >+++ linux-2.6.git/include/asm-generic/pgtable.h >> >> >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd >> >> > { >> >> > return pmd; >> >> > } >> >> >+ >> >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte; >> >> >+} >> >> >+ >> >> 
>+static inline int pte_swp_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return 0; >> >> >+} >> >> >+ >> >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) >> >> >+{ >> >> >+ return pte; >> >> >+} >> >> > #endif >> >> > >> >> > #ifndef __HAVE_PFNMAP_TRACKING >> >> >Index: linux-2.6.git/include/linux/swapops.h >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/include/linux/swapops.h >> >> >+++ linux-2.6.git/include/linux/swapops.h >> >> >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent >> >> > swp_entry_t arch_entry; >> >> > >> >> > BUG_ON(pte_file(pte)); >> >> >+ if (pte_swp_soft_dirty(pte)) >> >> >+ pte = pte_swp_clear_soft_dirty(pte); >> >> > arch_entry = __pte_to_swp_entry(pte); >> >> > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); >> >> > } >> >> >Index: linux-2.6.git/mm/memory.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/mm/memory.c >> >> >+++ linux-2.6.git/mm/memory.c >> >> >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct >> >> > exclusive = 1; >> >> > } >> >> > flush_icache_page(vma, page); >> >> >+ if (pte_swp_soft_dirty(orig_pte)) >> >> >+ pte = pte_mksoft_dirty(pte); >> >> >> >> entry = pte_to_swp_entry(orig_pte); >> >> orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. >> > >> >You seem to walk same way with me. >> >Please look at my stupid questions in this thread. >> > >> >> I see your discussion with Cyrill, however, pte_to_swp_entry and pte_swp_soft_dirty >> both against orig_pte, where I miss? ;-) > >pte_to_swp_entry is passed orig_pte by vaule, not a pointer >so although pte_to_swp_entry clear out _PTE_SWP_SOFT_DIRTY, it does it in local-copy. >So orig_pte is never changed. Ouch! Thanks for pointing out. 
;-) Reviewed-by: Wanpeng Li > >> >> >> >> >> > set_pte_at(mm, address, page_table, pte); >> >> > if (page == swapcache) >> >> > do_page_add_anon_rmap(page, vma, address, exclusive); >> >> >Index: linux-2.6.git/mm/rmap.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/mm/rmap.c >> >> >+++ linux-2.6.git/mm/rmap.c >> >> >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, >> >> > swp_entry_to_pte(make_hwpoison_entry(page))); >> >> > } else if (PageAnon(page)) { >> >> > swp_entry_t entry = { .val = page_private(page) }; >> >> >+ pte_t swp_pte; >> >> > >> >> > if (PageSwapCache(page)) { >> >> > /* >> >> >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, >> >> > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); >> >> > entry = make_migration_entry(page, pte_write(pteval)); >> >> > } >> >> >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); >> >> >+ swp_pte = swp_entry_to_pte(entry); >> >> >+ if (pte_soft_dirty(pteval)) >> >> >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); >> >> >+ set_pte_at(mm, address, pte, swp_pte); >> >> > BUG_ON(pte_file(*pte)); >> >> > } else if (IS_ENABLED(CONFIG_MIGRATION) && >> >> > (TTU_ACTION(flags) == TTU_MIGRATION)) { >> >> >Index: linux-2.6.git/mm/swapfile.c >> >> >=================================================================== >> >> >--- linux-2.6.git.orig/mm/swapfile.c >> >> >+++ linux-2.6.git/mm/swapfile.c >> >> >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, >> >> > } >> >> > #endif /* CONFIG_HIBERNATION */ >> >> > >> >> >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) >> >> >+{ >> >> >+#ifdef CONFIG_MEM_SOFT_DIRTY >> >> >+ /* >> >> >+ * When pte keeps soft dirty bit the pte generated >> >> >+ * from swap entry does not has it, still it's same >> >> >+ * pte from logical point of view. 
>> >> >+ */ >> >> >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); >> >> >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); >> >> >+#else >> >> >+ return pte_same(pte, swp_pte); >> >> >+#endif >> >> >+} >> >> >+ >> >> > /* >> >> > * No need to decide whether this PTE shares the swap entry with others, >> >> > * just let do_wp_page work it out if a write is requested later - to >> >> >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru >> >> > } >> >> > >> >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); >> >> >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { >> >> >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { >> >> > mem_cgroup_cancel_charge_swapin(memcg); >> >> > ret = 0; >> >> > goto out; >> >> >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are >> >> > * swapoff spends a _lot_ of time in this loop! >> >> > * Test inline before going to call unuse_pte. >> >> > */ >> >> >- if (unlikely(pte_same(*pte, swp_pte))) { >> >> >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { >> >> > pte_unmap(pte); >> >> > ret = unuse_pte(vma, pmd, addr, entry, page); >> >> > if (ret) >> >> > >> >> >-- >> >> >To unsubscribe, send a message with 'unsubscribe linux-mm' in >> >> >the body to majordomo@kvack.org. For more info on Linux MM, >> >> >see: http://www.linux-mm.org/ . >> >> >Don't email: email@kvack.org >> >> >> >> -- >> >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> >> the body to majordomo@kvack.org. For more info on Linux MM, >> >> see: http://www.linux-mm.org/ . >> >> Don't email: email@kvack.org >> > >> >-- >> >Kind regards, >> >Minchan Kim >> >> -- >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> the body to majordomo@kvack.org. For more info on Linux MM, >> see: http://www.linux-mm.org/ . >> Don't email: email@kvack.org > >-- >Kind regards, >Minchan Kim -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757709Ab3G3Uq7 (ORCPT ); Tue, 30 Jul 2013 16:46:59 -0400 Received: from mail-la0-f48.google.com ([209.85.215.48]:65168 "EHLO mail-la0-f48.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756953Ab3G3Uq5 (ORCPT ); Tue, 30 Jul 2013 16:46:57 -0400 Message-Id: <20130730204154.407090410@gmail.com> User-Agent: quilt/0.60-1 Date: Wed, 31 Jul 2013 00:41:54 +0400 From: Cyrill Gorcunov To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: [patch 0/2] Soft-dirty page tracker improvemens Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hi, as reported by Andy, there are a couple of situations in which the soft-dirty bit will be lost, in particular when the page we're tracking goes to swap and when a file page gets reclaimed. This series addresses both problems. One more difficulty which remains is the scenario when a vma area (which has the soft-dirty bit set in the appropriate pte entries) gets unmapped and then a new one is mapped in-place. I'm working on it now and hope to provide a patch soon. 
Thanks, Cyrill From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757831Ab3G3UrG (ORCPT ); Tue, 30 Jul 2013 16:47:06 -0400 Received: from mail-lb0-f175.google.com ([209.85.217.175]:62339 "EHLO mail-lb0-f175.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757033Ab3G3Uq6 (ORCPT ); Tue, 30 Jul 2013 16:46:58 -0400 Message-Id: <20130730204654.966378702@gmail.com> User-Agent: quilt/0.60-1 Date: Wed, 31 Jul 2013 00:41:56 +0400 From: Cyrill Gorcunov To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> Content-Disposition: inline; filename=pte-sft-dirty-file-2 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Andy reported that if a file page gets reclaimed we lose the soft-dirty bit if it was there, so save the _PAGE_BIT_SOFT_DIRTY bit when the page address gets encoded into the pte entry. Thus when a #pf happens on such a non-present pte we can restore it. 
Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andrew Morton Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" --- arch/x86/include/asm/pgtable-2level.h | 48 +++++++++++++++++++++++++++++++++- arch/x86/include/asm/pgtable-3level.h | 3 ++ arch/x86/include/asm/pgtable.h | 15 ++++++++++ arch/x86/include/asm/pgtable_types.h | 4 ++ fs/proc/task_mmu.c | 2 + include/asm-generic/pgtable.h | 15 ++++++++++ mm/fremap.c | 11 +++++-- mm/memory.c | 11 +++++-- mm/rmap.c | 8 ++++- 9 files changed, 107 insertions(+), 10 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable-2level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-2level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-2level.h @@ -55,9 +55,53 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_MEM_SOFT_DIRTY + +/* + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset + * into this range. 
+ */ +#define PTE_FILE_MAX_BITS 28 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) +#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) + +#define pte_to_pgoff(pte) \ + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ + & ((1U << PTE_FILE_BITS1) - 1))) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << (PTE_FILE_BITS1)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) \ + & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + & ((1U << PTE_FILE_BITS3) - 1)) \ + << PTE_FILE_SHIFT3) \ + + ((((off) >> \ + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ + << PTE_FILE_SHIFT4) \ + + _PAGE_FILE }) + +#else /* CONFIG_MEM_SOFT_DIRTY */ + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range: + * split up the 29 bits of offset into this range. 
*/ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) @@ -88,6 +132,8 @@ static inline pmd_t native_pmdp_get_and_ << PTE_FILE_SHIFT3) \ + _PAGE_FILE }) +#endif /* CONFIG_MEM_SOFT_DIRTY */ + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) Index: linux-2.6.git/arch/x86/include/asm/pgtable-3level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-3level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-3level.h @@ -179,6 +179,9 @@ static inline pmd_t native_pmdp_get_and_ /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the high part. + * + * For soft-dirty tracking 11 bit is taken from + * the low part of pte as well. */ #define pte_to_pgoff(pte) ((pte).pte_high) #define pgoff_to_pte(off) \ Index: linux-2.6.git/arch/x86/include/asm/pgtable.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h +++ linux-2.6.git/arch/x86/include/asm/pgtable.h @@ -329,6 +329,21 @@ static inline pte_t pte_swp_clear_soft_d return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFT_DIRTY; +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h +++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h @@ -61,8 +61,10 @@ * they do not conflict with each other. */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN + #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif Index: linux-2.6.git/fs/proc/task_mmu.c =================================================================== --- linux-2.6.git.orig/fs/proc/task_mmu.c +++ linux-2.6.git/fs/proc/task_mmu.c @@ -736,6 +736,8 @@ static inline void clear_soft_dirty(stru ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + } else if (pte_file(ptent)) { + ptent = pte_file_clear_soft_dirty(ptent); } set_pte_at(vma->vm_mm, addr, pte, ptent); Index: linux-2.6.git/include/asm-generic/pgtable.h =================================================================== --- linux-2.6.git.orig/include/asm-generic/pgtable.h +++ linux-2.6.git/include/asm-generic/pgtable.h @@ -432,6 +432,21 @@ static inline pte_t pte_swp_clear_soft_d { return pte; } + +static inline pte_t pte_file_clear_soft_dirty(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_file_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_file_soft_dirty(pte_t pte) +{ + return 0; +} #endif #ifndef __HAVE_PFNMAP_TRACKING Index: linux-2.6.git/mm/fremap.c =================================================================== --- linux-2.6.git.orig/mm/fremap.c +++ linux-2.6.git/mm/fremap.c @@ -57,17 +57,22 @@ static int install_file_pte(struct mm_st unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; - pte_t *pte; + pte_t *pte, ptfile; spinlock_t *ptl; pte = 
get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - if (!pte_none(*pte)) + ptfile = pgoff_to_pte(pgoff); + + if (!pte_none(*pte)) { + if (pte_present(*pte) && pte_soft_dirty(*pte)) + pte_file_mksoft_dirty(ptfile); zap_pte(mm, vma, addr, pte); + } - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + set_pte_at(mm, addr, pte, ptfile); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a Index: linux-2.6.git/mm/memory.c =================================================================== --- linux-2.6.git.orig/mm/memory.c +++ linux-2.6.git/mm/memory.c @@ -1141,9 +1141,12 @@ again: continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + addr) != page->index) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(ptent)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, addr, pte, ptfile); + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -3410,6 +3413,8 @@ static int __do_fault(struct mm_struct * entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); Index: linux-2.6.git/mm/rmap.c =================================================================== --- linux-2.6.git.orig/mm/rmap.c +++ linux-2.6.git/mm/rmap.c @@ -1405,8 +1405,12 @@ static int try_to_unmap_cluster(unsigned pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. 
*/ - if (page->index != linear_page_index(vma, address)) - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + if (page->index != linear_page_index(vma, address)) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(pteval)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, address, pte, ptfile); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757881Ab3G3Urb (ORCPT ); Tue, 30 Jul 2013 16:47:31 -0400 Received: from mail-la0-f44.google.com ([209.85.215.44]:60740 "EHLO mail-la0-f44.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756954Ab3G3Uq6 (ORCPT ); Tue, 30 Jul 2013 16:46:58 -0400 Message-Id: <20130730204654.844299768@gmail.com> User-Agent: quilt/0.60-1 Date: Wed, 31 Jul 2013 00:41:55 +0400 From: Cyrill Gorcunov To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages References: <20130730204154.407090410@gmail.com> Content-Disposition: inline; filename=pte-sft-dirty-swap-4 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY bit set get swapped out, the bit is getting lost and no longer available when pte read back. To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is saved in pte entry for the page being swapped out. When such page is to be read back from a swap cache we check for bit presence and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY bit back. 
One of the problem was to find a place in pte entry where we can save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The _PAGE_PSE was chosen for that, it doesn't intersect with swap entry format stored in pte. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andrew Morton Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" --- arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ fs/proc/task_mmu.c | 21 +++++++++++++++------ include/asm-generic/pgtable.h | 15 +++++++++++++++ include/linux/swapops.h | 2 ++ mm/memory.c | 2 ++ mm/rmap.c | 6 +++++- mm/swapfile.c | 19 +++++++++++++++++-- 8 files changed, 84 insertions(+), 9 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h +++ linux-2.6.git/arch/x86/include/asm/pgtable.h @@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h +++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h @@ -67,6 +67,19 @@ #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +/* + * Tracking soft dirty bit when a page goes to a swap is tricky. 
+ * We need a bit which can be stored in pte _and_ not conflict + * with swap entry format. On x86 bits 6 and 7 are *not* involved + * into swap entry computation, but bit 6 is used for nonlinear + * file mapping, so we borrow bit 7 for soft dirty tracking. + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#else +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #else Index: linux-2.6.git/fs/proc/task_mmu.c =================================================================== --- linux-2.6.git.orig/fs/proc/task_mmu.c +++ linux-2.6.git/fs/proc/task_mmu.c @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru * of how soft-dirty works. */ pte_t ptent = *pte; - ptent = pte_wrprotect(ptent); - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + + if (pte_present(ptent)) { + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_clear_soft_dirty(ptent); + } + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; - if (!pte_present(ptent)) - continue; if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } + if (!pte_present(ptent)) + continue; + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); } else if (is_swap_pte(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; + entry = pte_to_swp_entry(pte); frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags = PM_SWAP; Index: 
linux-2.6.git/include/asm-generic/pgtable.h =================================================================== --- linux-2.6.git.orig/include/asm-generic/pgtable.h +++ linux-2.6.git/include/asm-generic/pgtable.h @@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd { return pmd; } + +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return pte; +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return pte; +} #endif #ifndef __HAVE_PFNMAP_TRACKING Index: linux-2.6.git/include/linux/swapops.h =================================================================== --- linux-2.6.git.orig/include/linux/swapops.h +++ linux-2.6.git/include/linux/swapops.h @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent swp_entry_t arch_entry; BUG_ON(pte_file(pte)); + if (pte_swp_soft_dirty(pte)) + pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } Index: linux-2.6.git/mm/memory.c =================================================================== --- linux-2.6.git.orig/mm/memory.c +++ linux-2.6.git/mm/memory.c @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); Index: linux-2.6.git/mm/rmap.c =================================================================== --- linux-2.6.git.orig/mm/rmap.c +++ linux-2.6.git/mm/rmap.c @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, BUG_ON(TTU_ACTION(flags) != 
TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { Index: linux-2.6.git/mm/swapfile.c =================================================================== --- linux-2.6.git.orig/mm/swapfile.c +++ linux-2.6.git/mm/swapfile.c @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. 
*/ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1759241Ab3GaIRP (ORCPT ); Wed, 31 Jul 2013 04:17:15 -0400 Received: from relay.parallels.com ([195.214.232.42]:55102 "EHLO relay.parallels.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1759217Ab3GaIRM (ORCPT ); Wed, 31 Jul 2013 04:17:12 -0400 Message-ID: <51F8C7F4.9020504@parallels.com> Date: Wed, 31 Jul 2013 12:16:52 +0400 From: Pavel Emelyanov User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20120605 Thunderbird/13.0 MIME-Version: 1.0 To: Cyrill Gorcunov , CC: , , , , , , , , , , Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> In-Reply-To: <20130730204654.966378702@gmail.com> Content-Type: text/plain; charset="ISO-8859-1" Content-Transfer-Encoding: 7bit X-Originating-IP: [10.30.16.114] Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On 07/31/2013 12:41 AM, Cyrill Gorcunov wrote: > Andy reported that if file page get reclaimed we loose soft-dirty bit > if it was there, so save _PAGE_BIT_SOFT_DIRTY bit when page address > get encoded into pte entry. Thus when #pf happens on such non-present > pte we can restore it back. 
> > Reported-by: Andy Lutomirski > Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1759259Ab3GaIUW (ORCPT ); Wed, 31 Jul 2013 04:20:22 -0400 Received: from relay.parallels.com ([195.214.232.42]:55028 "EHLO relay.parallels.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1759172Ab3GaIQp (ORCPT ); Wed, 31 Jul 2013 04:16:45 -0400 Message-ID: <51F8C7CC.6010703@parallels.com> Date: Wed, 31 Jul 2013 12:16:12 +0400 From: Pavel Emelyanov User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20120605 Thunderbird/13.0 MIME-Version: 1.0 To: Cyrill Gorcunov , CC: , , , , , , , , , , Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> In-Reply-To: <20130730204654.844299768@gmail.com> Content-Type: text/plain; charset="ISO-8859-1" Content-Transfer-Encoding: 7bit X-Originating-IP: [10.30.16.114] Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On 07/31/2013 12:41 AM, Cyrill Gorcunov wrote: > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > bit set get swapped out, the bit is getting lost and no longer > available when pte read back. > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > saved in pte entry for the page being swapped out. When such page > is to be read back from a swap cache we check for bit presence > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > bit back. > > One of the problem was to find a place in pte entry where we can > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > _PAGE_PSE was chosen for that, it doesn't intersect with swap > entry format stored in pte. 
> > Reported-by: Andy Lutomirski > Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751981Ab3HAAvJ (ORCPT ); Wed, 31 Jul 2013 20:51:09 -0400 Received: from LGEMRELSE7Q.lge.com ([156.147.1.151]:42044 "EHLO LGEMRELSE7Q.lge.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751156Ab3HAAvH (ORCPT ); Wed, 31 Jul 2013 20:51:07 -0400 X-AuditID: 9c930197-b7bfbae000000e88-4d-51f9b0f802d5 Date: Thu, 1 Aug 2013 09:51:32 +0900 From: Minchan Kim To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801005132.GB19540@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130730204654.844299768@gmail.com> User-Agent: Mutt/1.5.21 (2010-09-15) X-Brightmail-Tracker: AAAAAA== Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hello, On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > bit set get swapped out, the bit is getting lost and no longer > available when pte read back. > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > saved in pte entry for the page being swapped out. When such page > is to be read back from a swap cache we check for bit presence > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > bit back. 
> > One of the problem was to find a place in pte entry where we can > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > _PAGE_PSE was chosen for that, it doesn't intersect with swap > entry format stored in pte. > > Reported-by: Andy Lutomirski > Signed-off-by: Cyrill Gorcunov > Cc: Pavel Emelyanov > Cc: Andrew Morton > Cc: Matt Mackall > Cc: Xiao Guangrong > Cc: Marcelo Tosatti > Cc: KOSAKI Motohiro > Cc: Stephen Rothwell > Cc: Peter Zijlstra > Cc: "Aneesh Kumar K.V" > --- > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > fs/proc/task_mmu.c | 21 +++++++++++++++------ > include/asm-generic/pgtable.h | 15 +++++++++++++++ > include/linux/swapops.h | 2 ++ > mm/memory.c | 2 ++ > mm/rmap.c | 6 +++++- > mm/swapfile.c | 19 +++++++++++++++++-- > 8 files changed, 84 insertions(+), 9 deletions(-) > > Index: linux-2.6.git/arch/x86/include/asm/pgtable.h > =================================================================== > --- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h > +++ linux-2.6.git/arch/x86/include/asm/pgtable.h > @@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > } > > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > +{ > + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); > +} > + > +static inline int pte_swp_soft_dirty(pte_t pte) > +{ > + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; > +} > + > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > +{ > + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); > +} > + > /* > * Mask out unsupported bits in a present pgprot. Non-present pgprots > * can use those bits for other purposes, so leave them be. 
> Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h > =================================================================== > --- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h > +++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h > @@ -67,6 +67,19 @@ > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > #endif > > +/* > + * Tracking soft dirty bit when a page goes to a swap is tricky. > + * We need a bit which can be stored in pte _and_ not conflict > + * with swap entry format. On x86 bits 6 and 7 are *not* involved > + * into swap entry computation, but bit 6 is used for nonlinear > + * file mapping, so we borrow bit 7 for soft dirty tracking. > + */ > +#ifdef CONFIG_MEM_SOFT_DIRTY > +#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE > +#else > +#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > +#endif > + > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > #else > Index: linux-2.6.git/fs/proc/task_mmu.c > =================================================================== > --- linux-2.6.git.orig/fs/proc/task_mmu.c > +++ linux-2.6.git/fs/proc/task_mmu.c > @@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > * of how soft-dirty works. 
> */ > pte_t ptent = *pte; > - ptent = pte_wrprotect(ptent); > - ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > + > + if (pte_present(ptent)) { > + ptent = pte_wrprotect(ptent); > + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > + } else if (is_swap_pte(ptent)) { > + ptent = pte_swp_clear_soft_dirty(ptent); > + } > + > set_pte_at(vma->vm_mm, addr, pte, ptent); > #endif > } > @@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > for (; addr != end; pte++, addr += PAGE_SIZE) { > ptent = *pte; > - if (!pte_present(ptent)) > - continue; > > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > clear_soft_dirty(vma, addr, pte); > continue; > } > > + if (!pte_present(ptent)) > + continue; > + > page = vm_normal_page(vma, addr, ptent); > if (!page) > continue; > @@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > flags = PM_PRESENT; > page = vm_normal_page(vma, addr, pte); > } else if (is_swap_pte(pte)) { > - swp_entry_t entry = pte_to_swp_entry(pte); > - > + swp_entry_t entry; > + if (pte_swp_soft_dirty(pte)) > + flags2 |= __PM_SOFT_DIRTY; > + entry = pte_to_swp_entry(pte); > frame = swp_type(entry) | > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > flags = PM_SWAP; > Index: linux-2.6.git/include/asm-generic/pgtable.h > =================================================================== > --- linux-2.6.git.orig/include/asm-generic/pgtable.h > +++ linux-2.6.git/include/asm-generic/pgtable.h > @@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > { > return pmd; > } > + > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > +{ > + return pte; > +} > + > +static inline int pte_swp_soft_dirty(pte_t pte) > +{ > + return 0; > +} > + > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > +{ > + return pte; > +} > #endif > > #ifndef __HAVE_PFNMAP_TRACKING > Index: linux-2.6.git/include/linux/swapops.h > =================================================================== > --- 
linux-2.6.git.orig/include/linux/swapops.h > +++ linux-2.6.git/include/linux/swapops.h > @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > swp_entry_t arch_entry; > > BUG_ON(pte_file(pte)); > + if (pte_swp_soft_dirty(pte)) > + pte = pte_swp_clear_soft_dirty(pte); Why do you remove soft-dirty flag whenever pte_to_swp_entry is called? Isn't there any problem if we use mincore? > arch_entry = __pte_to_swp_entry(pte); > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > } > Index: linux-2.6.git/mm/memory.c > =================================================================== > --- linux-2.6.git.orig/mm/memory.c > +++ linux-2.6.git/mm/memory.c > @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > exclusive = 1; > } > flush_icache_page(vma, page); > + if (pte_swp_soft_dirty(orig_pte)) > + pte = pte_mksoft_dirty(pte); > set_pte_at(mm, address, page_table, pte); > if (page == swapcache) > do_page_add_anon_rmap(page, vma, address, exclusive); > Index: linux-2.6.git/mm/rmap.c > =================================================================== > --- linux-2.6.git.orig/mm/rmap.c > +++ linux-2.6.git/mm/rmap.c > @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > swp_entry_to_pte(make_hwpoison_entry(page))); > } else if (PageAnon(page)) { > swp_entry_t entry = { .val = page_private(page) }; > + pte_t swp_pte; > > if (PageSwapCache(page)) { > /* > @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > entry = make_migration_entry(page, pte_write(pteval)); > } > - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); > + swp_pte = swp_entry_to_pte(entry); > + if (pte_soft_dirty(pteval)) > + swp_pte = pte_swp_mksoft_dirty(swp_pte); > + set_pte_at(mm, address, pte, swp_pte); > BUG_ON(pte_file(*pte)); > } else if (IS_ENABLED(CONFIG_MIGRATION) && > (TTU_ACTION(flags) == TTU_MIGRATION)) { > Index: linux-2.6.git/mm/swapfile.c > 
=================================================================== > --- linux-2.6.git.orig/mm/swapfile.c > +++ linux-2.6.git/mm/swapfile.c > @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > } > #endif /* CONFIG_HIBERNATION */ > > +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) Nitpick. If maybe_same_pte is used widely, it looks good to me but it's used for only swapoff at the moment so I think pte_swap_same would be better name. > +{ > +#ifdef CONFIG_MEM_SOFT_DIRTY > + /* > + * When pte keeps soft dirty bit the pte generated > + * from swap entry does not has it, still it's same > + * pte from logical point of view. > + */ > + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); > + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); > +#else > + return pte_same(pte, swp_pte); > +#endif > +} > + > /* > * No need to decide whether this PTE shares the swap entry with others, > * just let do_wp_page work it out if a write is requested later - to > @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > } > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { > + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > mem_cgroup_cancel_charge_swapin(memcg); > ret = 0; > goto out; > @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > * swapoff spends a _lot_ of time in this loop! > * Test inline before going to call unuse_pte. > */ > - if (unlikely(pte_same(*pte, swp_pte))) { > + if (unlikely(maybe_same_pte(*pte, swp_pte))) { > pte_unmap(pte); > ret = unuse_pte(vma, pmd, addr, entry, page); > if (ret) > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: email@kvack.org -- Kind regards, Minchan Kim From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753488Ab3HAFxW (ORCPT ); Thu, 1 Aug 2013 01:53:22 -0400 Received: from mail-lb0-f170.google.com ([209.85.217.170]:36775 "EHLO mail-lb0-f170.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751882Ab3HAFxU (ORCPT ); Thu, 1 Aug 2013 01:53:20 -0400 Date: Thu, 1 Aug 2013 09:53:03 +0400 From: Cyrill Gorcunov To: Minchan Kim Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801055303.GA1764@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801005132.GB19540@bbox> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, Aug 01, 2013 at 09:51:32AM +0900, Minchan Kim wrote: > > Index: linux-2.6.git/include/linux/swapops.h > > =================================================================== > > --- linux-2.6.git.orig/include/linux/swapops.h > > +++ linux-2.6.git/include/linux/swapops.h > > @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > > swp_entry_t arch_entry; > > > > BUG_ON(pte_file(pte)); > > + if (pte_swp_soft_dirty(pte)) > > + pte = pte_swp_clear_soft_dirty(pte); > > Why do you remove soft-dirty flag whenever pte_to_swp_entry is called? > Isn't there any problem if we use mincore? No, there is no problem. 
pte_to_swp_entry is called when we know that the pte we're decoding has swap format (except the case in swap code which figures out the number of bits allowed for offset). Still, since this bit is set on a "higher" level than the __swp_type/__swp_offset helpers, it should be cleared before the value from the pte is passed to the "one level down" helper functions. > > +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > > Nitpick. > If maybe_same_pte is used widely, it looks good to me > but it's used for only swapoff at the moment so I think pte_swap_same > would be better name. I don't see much difference, but sure, let's rename it on top once the series is in the -mm tree, sounds good? Cyrill From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754409Ab3HAGQI (ORCPT ); Thu, 1 Aug 2013 02:16:08 -0400 Received: from LGEMRELSE1Q.lge.com ([156.147.1.111]:60529 "EHLO LGEMRELSE1Q.lge.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753271Ab3HAGQF (ORCPT ); Thu, 1 Aug 2013 02:16:05 -0400 X-AuditID: 9c93016f-b7b05ae000002bbd-d8-51f9fd24a9fb Date: Thu, 1 Aug 2013 15:16:32 +0900 From: Minchan Kim To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801061632.GE19540@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801055303.GA1764@moon> User-Agent: Mutt/1.5.21 (2010-09-15) X-Brightmail-Tracker: AAAAAA== Sender: linux-kernel-owner@vger.kernel.org 
List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, Aug 01, 2013 at 09:53:03AM +0400, Cyrill Gorcunov wrote: > On Thu, Aug 01, 2013 at 09:51:32AM +0900, Minchan Kim wrote: > > > Index: linux-2.6.git/include/linux/swapops.h > > > =================================================================== > > > --- linux-2.6.git.orig/include/linux/swapops.h > > > +++ linux-2.6.git/include/linux/swapops.h > > > @@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > > > swp_entry_t arch_entry; > > > > > > BUG_ON(pte_file(pte)); > > > + if (pte_swp_soft_dirty(pte)) > > > + pte = pte_swp_clear_soft_dirty(pte); > > > > Why do you remove soft-dirty flag whenever pte_to_swp_entry is called? > > Isn't there any problem if we use mincore? > > No, there is no problem. pte_to_swp_entry caller when we know that pte > we're decoding is having swap format (except the case in swap code which > figures out the number of bits allowed for offset). Still since this bit > is set on "higher" level than __swp_type/__swp_offset helpers it should > be cleaned before the value from pte comes to "one level down" helpers > function. I don't get it. Could you correct me with below example? Process A context try_to_unmap swp_pte = swp_entry_to_pte /* change generic swp into arch swap */ swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(, swp_pte); Process A context .. mincore_pte_range pte_to_swp_entry pte = pte_swp_clear_soft_dirty <=== 1) change arch swp with generic swp mincore_page Process B want to know dirty state of the page .. pagemap_read pte_to_pagemap_entry is_swap_pte if (pte_swap_soft_dirty(pte)) <=== but failed by 1) So, Process B can't get the dirty status from process A's the page. > > > > +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > > > > Nitpick. > > If maybe_same_pte is used widely, it looks good to me > > but it's used for only swapoff at the moment so I think pte_swap_same > > would be better name. 
> > I don't see much difference, but sure, lets rename it on top once series > in -mm tree, sounds good? > > Cyrill > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: email@kvack.org -- Kind regards, Minchan Kim From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752576Ab3HAG2T (ORCPT ); Thu, 1 Aug 2013 02:28:19 -0400 Received: from mail-la0-f42.google.com ([209.85.215.42]:35282 "EHLO mail-la0-f42.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750850Ab3HAG2S (ORCPT ); Thu, 1 Aug 2013 02:28:18 -0400 Date: Thu, 1 Aug 2013 10:28:14 +0400 From: Cyrill Gorcunov To: Minchan Kim Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801062814.GB1764@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> <20130801061632.GE19540@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801061632.GE19540@bbox> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, Aug 01, 2013 at 03:16:32PM +0900, Minchan Kim wrote: > > I don't get it. Could you correct me with below example? > > Process A context > try_to_unmap > swp_pte = swp_entry_to_pte /* change generic swp into arch swap */ > swp_pte = pte_swp_mksoft_dirty(swp_pte); > set_pte_at(, swp_pte); > > Process A context > .. 
> mincore_pte_range pte_t pte = *ptep; <-- local copy of the pte value, in memory it remains the same with swap softdirty bit set > pte_to_swp_entry > pte = pte_swp_clear_soft_dirty <=== 1) > change arch swp with generic swp > mincore_page > > Process B want to know dirty state of the page > .. > pagemap_read > pte_to_pagemap_entry > is_swap_pte > if (pte_swap_soft_dirty(pte)) <=== but failed by 1) > > So, Process B can't get the dirty status from process A's the page. From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751296Ab3HAGgl (ORCPT ); Thu, 1 Aug 2013 02:36:41 -0400 Received: from LGEMRELSE6Q.lge.com ([156.147.1.121]:49976 "EHLO LGEMRELSE6Q.lge.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750742Ab3HAGgk (ORCPT ); Thu, 1 Aug 2013 02:36:40 -0400 X-AuditID: 9c930179-b7c49ae000000e68-b9-51fa01f64c2e Date: Thu, 1 Aug 2013 15:37:06 +0900 From: Minchan Kim To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801063706.GF19540@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> <20130801061632.GE19540@bbox> <20130801062814.GB1764@moon> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801062814.GB1764@moon> User-Agent: Mutt/1.5.21 (2010-09-15) X-Brightmail-Tracker: AAAAAA== Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, Aug 01, 2013 at 10:28:14AM +0400, Cyrill Gorcunov wrote: > On Thu, Aug 01, 2013 at 
03:16:32PM +0900, Minchan Kim wrote: > > > > I don't get it. Could you correct me with below example? > > > > Process A context > > try_to_unmap > > swp_pte = swp_entry_to_pte /* change generic swp into arch swap */ > > swp_pte = pte_swp_mksoft_dirty(swp_pte); > > set_pte_at(, swp_pte); > > > > Process A context > > .. > > mincore_pte_range > pte_t pte = *ptep; <-- local copy of the pte value, in memory it remains the same > with swap softdirty bit set Argh, I missed that. Thank you! Reviewed-by: Minchan Kim -- Kind regards, Minchan Kim From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751248Ab3HAGiS (ORCPT ); Thu, 1 Aug 2013 02:38:18 -0400 Received: from mail-la0-f50.google.com ([209.85.215.50]:51762 "EHLO mail-la0-f50.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750850Ab3HAGiR (ORCPT ); Thu, 1 Aug 2013 02:38:17 -0400 Date: Thu, 1 Aug 2013 10:38:13 +0400 From: Cyrill Gorcunov To: Minchan Kim Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130801063813.GC1764@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130801005132.GB19540@bbox> <20130801055303.GA1764@moon> <20130801061632.GE19540@bbox> <20130801062814.GB1764@moon> <20130801063706.GF19540@bbox> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130801063706.GF19540@bbox> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, Aug 01, 2013 at 03:37:06PM +0900, Minchan Kim wrote: > > 
Reviewed-by: Minchan Kim Thanks a lot for review, Minchan! From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754327Ab3HECQh (ORCPT ); Sun, 4 Aug 2013 22:16:37 -0400 Received: from lgeamrelo01.lge.com ([156.147.1.125]:63575 "EHLO LGEAMRELO01.lge.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754135Ab3HECQg (ORCPT ); Sun, 4 Aug 2013 22:16:36 -0400 X-AuditID: 9c93017d-b7b45ae000000e34-53-51ff0b024307 Date: Mon, 5 Aug 2013 11:17:15 +0900 From: Minchan Kim To: Wanpeng Li Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805021715.GJ32486@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> User-Agent: Mutt/1.5.21 (2010-09-15) X-Brightmail-Tracker: AAAAAA== Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hello Wanpeng, On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: > On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: > >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > >bit set get swapped out, the bit is getting lost and no longer > >available when pte read back. > > > >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > >saved in pte entry for the page being swapped out. 
When such page > >is to be read back from a swap cache we check for bit presence > >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > >bit back. > > > >One of the problem was to find a place in pte entry where we can > >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > >_PAGE_PSE was chosen for that, it doesn't intersect with swap > >entry format stored in pte. > > > >Reported-by: Andy Lutomirski > >Signed-off-by: Cyrill Gorcunov > >Cc: Pavel Emelyanov > >Cc: Andrew Morton > >Cc: Matt Mackall > >Cc: Xiao Guangrong > >Cc: Marcelo Tosatti > >Cc: KOSAKI Motohiro > >Cc: Stephen Rothwell > >Cc: Peter Zijlstra > >Cc: "Aneesh Kumar K.V" > >--- > > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > > fs/proc/task_mmu.c | 21 +++++++++++++++------ > > include/asm-generic/pgtable.h | 15 +++++++++++++++ > > include/linux/swapops.h | 2 ++ > > mm/memory.c | 2 ++ > > mm/rmap.c | 6 +++++- > > mm/swapfile.c | 19 +++++++++++++++++-- > > 8 files changed, 84 insertions(+), 9 deletions(-) > > > >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h > >=================================================================== > >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h > >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h > >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > > } > > > >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >+{ > >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >+} > >+ > >+static inline int pte_swp_soft_dirty(pte_t pte) > >+{ > >+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; > >+} > >+ > >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >+{ > >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >+} > >+ > > /* > > * Mask out unsupported bits in a present pgprot. Non-present pgprots > > * can use those bits for other purposes, so leave them be. 
> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >=================================================================== > >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h > >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >@@ -67,6 +67,19 @@ > > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > > #endif > > > >+/* > >+ * Tracking soft dirty bit when a page goes to a swap is tricky. > >+ * We need a bit which can be stored in pte _and_ not conflict > >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved > >+ * into swap entry computation, but bit 6 is used for nonlinear > >+ * file mapping, so we borrow bit 7 for soft dirty tracking. > >+ */ > >+#ifdef CONFIG_MEM_SOFT_DIRTY > >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE > >+#else > >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > >+#endif > >+ > > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > > #else > >Index: linux-2.6.git/fs/proc/task_mmu.c > >=================================================================== > >--- linux-2.6.git.orig/fs/proc/task_mmu.c > >+++ linux-2.6.git/fs/proc/task_mmu.c > >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > > * of how soft-dirty works. 
> > */ > > pte_t ptent = *pte; > >- ptent = pte_wrprotect(ptent); > >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >+ > >+ if (pte_present(ptent)) { > >+ ptent = pte_wrprotect(ptent); > >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >+ } else if (is_swap_pte(ptent)) { > >+ ptent = pte_swp_clear_soft_dirty(ptent); > >+ } > >+ > > set_pte_at(vma->vm_mm, addr, pte, ptent); > > #endif > > } > >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > > for (; addr != end; pte++, addr += PAGE_SIZE) { > > ptent = *pte; > >- if (!pte_present(ptent)) > >- continue; > > > > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > > clear_soft_dirty(vma, addr, pte); > > continue; > > } > > > >+ if (!pte_present(ptent)) > >+ continue; > >+ > > page = vm_normal_page(vma, addr, ptent); > > if (!page) > > continue; > >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > > flags = PM_PRESENT; > > page = vm_normal_page(vma, addr, pte); > > } else if (is_swap_pte(pte)) { > >- swp_entry_t entry = pte_to_swp_entry(pte); > >- > >+ swp_entry_t entry; > >+ if (pte_swp_soft_dirty(pte)) > >+ flags2 |= __PM_SOFT_DIRTY; > >+ entry = pte_to_swp_entry(pte); > > frame = swp_type(entry) | > > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > > flags = PM_SWAP; > >Index: linux-2.6.git/include/asm-generic/pgtable.h > >=================================================================== > >--- linux-2.6.git.orig/include/asm-generic/pgtable.h > >+++ linux-2.6.git/include/asm-generic/pgtable.h > >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > > { > > return pmd; > > } > >+ > >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >+{ > >+ return pte; > >+} > >+ > >+static inline int pte_swp_soft_dirty(pte_t pte) > >+{ > >+ return 0; > >+} > >+ > >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >+{ > >+ return pte; > >+} > > #endif > > > > #ifndef __HAVE_PFNMAP_TRACKING > >Index: 
linux-2.6.git/include/linux/swapops.h > >=================================================================== > >--- linux-2.6.git.orig/include/linux/swapops.h > >+++ linux-2.6.git/include/linux/swapops.h > >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > > swp_entry_t arch_entry; > > > > BUG_ON(pte_file(pte)); > >+ if (pte_swp_soft_dirty(pte)) > >+ pte = pte_swp_clear_soft_dirty(pte); > > arch_entry = __pte_to_swp_entry(pte); > > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > > } > >Index: linux-2.6.git/mm/memory.c > >=================================================================== > >--- linux-2.6.git.orig/mm/memory.c > >+++ linux-2.6.git/mm/memory.c > >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > > exclusive = 1; > > } > > flush_icache_page(vma, page); > >+ if (pte_swp_soft_dirty(orig_pte)) > >+ pte = pte_mksoft_dirty(pte); > > entry = pte_to_swp_entry(orig_pte); > orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. You seem to walk same way with me. Please look at my stupid questions in this thread. 
> > > set_pte_at(mm, address, page_table, pte); > > if (page == swapcache) > > do_page_add_anon_rmap(page, vma, address, exclusive); > >Index: linux-2.6.git/mm/rmap.c > >=================================================================== > >--- linux-2.6.git.orig/mm/rmap.c > >+++ linux-2.6.git/mm/rmap.c > >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > > swp_entry_to_pte(make_hwpoison_entry(page))); > > } else if (PageAnon(page)) { > > swp_entry_t entry = { .val = page_private(page) }; > >+ pte_t swp_pte; > > > > if (PageSwapCache(page)) { > > /* > >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > > entry = make_migration_entry(page, pte_write(pteval)); > > } > >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); > >+ swp_pte = swp_entry_to_pte(entry); > >+ if (pte_soft_dirty(pteval)) > >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); > >+ set_pte_at(mm, address, pte, swp_pte); > > BUG_ON(pte_file(*pte)); > > } else if (IS_ENABLED(CONFIG_MIGRATION) && > > (TTU_ACTION(flags) == TTU_MIGRATION)) { > >Index: linux-2.6.git/mm/swapfile.c > >=================================================================== > >--- linux-2.6.git.orig/mm/swapfile.c > >+++ linux-2.6.git/mm/swapfile.c > >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > > } > > #endif /* CONFIG_HIBERNATION */ > > > >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > >+{ > >+#ifdef CONFIG_MEM_SOFT_DIRTY > >+ /* > >+ * When pte keeps soft dirty bit the pte generated > >+ * from swap entry does not has it, still it's same > >+ * pte from logical point of view. 
> >+ */ > >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); > >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); > >+#else > >+ return pte_same(pte, swp_pte); > >+#endif > >+} > >+ > > /* > > * No need to decide whether this PTE shares the swap entry with others, > > * just let do_wp_page work it out if a write is requested later - to > >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > > } > > > > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { > >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > > mem_cgroup_cancel_charge_swapin(memcg); > > ret = 0; > > goto out; > >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > > * swapoff spends a _lot_ of time in this loop! > > * Test inline before going to call unuse_pte. > > */ > >- if (unlikely(pte_same(*pte, swp_pte))) { > >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { > > pte_unmap(pte); > > ret = unuse_pte(vma, pmd, addr, entry, page); > > if (ret) > > > >-- > >To unsubscribe, send a message with 'unsubscribe linux-mm' in > >the body to majordomo@kvack.org. For more info on Linux MM, > >see: http://www.linux-mm.org/ . > >Don't email: email@kvack.org > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: email@kvack.org -- Kind regards, Minchan Kim From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754498Ab3HECx7 (ORCPT ); Sun, 4 Aug 2013 22:53:59 -0400 Received: from LGEMRELSE6Q.lge.com ([156.147.1.121]:43353 "EHLO LGEMRELSE6Q.lge.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754458Ab3HECx5 (ORCPT ); Sun, 4 Aug 2013 22:53:57 -0400 X-AuditID: 9c930179-b7c49ae000000e68-59-51ff13c45b24 Date: Mon, 5 Aug 2013 11:54:37 +0900 From: Minchan Kim To: Wanpeng Li Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805025437.GK32486@bbox> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> User-Agent: Mutt/1.5.21 (2010-09-15) X-Brightmail-Tracker: AAAAAA== Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Mon, Aug 05, 2013 at 10:38:58AM +0800, Wanpeng Li wrote: > Hi Minchan, > > On Mon, Aug 05, 2013 at 11:17:15AM +0900, Minchan Kim wrote: > >Hello Wanpeng, > > > >On Mon, Aug 05, 2013 at 09:48:29AM +0800, Wanpeng Li wrote: > >> On Wed, Jul 31, 2013 at 12:41:55AM +0400, Cyrill Gorcunov wrote: > >> >Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > >> >bit set get swapped out, the 
bit is getting lost and no longer > >> >available when pte read back. > >> > > >> >To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > >> >saved in pte entry for the page being swapped out. When such page > >> >is to be read back from a swap cache we check for bit presence > >> >and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > >> >bit back. > >> > > >> >One of the problem was to find a place in pte entry where we can > >> >save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > >> >_PAGE_PSE was chosen for that, it doesn't intersect with swap > >> >entry format stored in pte. > >> > > >> >Reported-by: Andy Lutomirski > >> >Signed-off-by: Cyrill Gorcunov > >> >Cc: Pavel Emelyanov > >> >Cc: Andrew Morton > >> >Cc: Matt Mackall > >> >Cc: Xiao Guangrong > >> >Cc: Marcelo Tosatti > >> >Cc: KOSAKI Motohiro > >> >Cc: Stephen Rothwell > >> >Cc: Peter Zijlstra > >> >Cc: "Aneesh Kumar K.V" > >> >--- > >> > arch/x86/include/asm/pgtable.h | 15 +++++++++++++++ > >> > arch/x86/include/asm/pgtable_types.h | 13 +++++++++++++ > >> > fs/proc/task_mmu.c | 21 +++++++++++++++------ > >> > include/asm-generic/pgtable.h | 15 +++++++++++++++ > >> > include/linux/swapops.h | 2 ++ > >> > mm/memory.c | 2 ++ > >> > mm/rmap.c | 6 +++++- > >> > mm/swapfile.c | 19 +++++++++++++++++-- > >> > 8 files changed, 84 insertions(+), 9 deletions(-) > >> > > >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable.h > >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable.h > >> >@@ -314,6 +314,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > >> > return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); > >> > } > >> > > >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >> >+{ > >> >+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >> >+} > >> >+ > >> >+static inline int pte_swp_soft_dirty(pte_t pte) > >> >+{ > >> >+ return pte_flags(pte) & 
_PAGE_SWP_SOFT_DIRTY; > >> >+} > >> >+ > >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >> >+{ > >> >+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); > >> >+} > >> >+ > >> > /* > >> > * Mask out unsupported bits in a present pgprot. Non-present pgprots > >> > * can use those bits for other purposes, so leave them be. > >> >Index: linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/arch/x86/include/asm/pgtable_types.h > >> >+++ linux-2.6.git/arch/x86/include/asm/pgtable_types.h > >> >@@ -67,6 +67,19 @@ > >> > #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) > >> > #endif > >> > > >> >+/* > >> >+ * Tracking soft dirty bit when a page goes to a swap is tricky. > >> >+ * We need a bit which can be stored in pte _and_ not conflict > >> >+ * with swap entry format. On x86 bits 6 and 7 are *not* involved > >> >+ * into swap entry computation, but bit 6 is used for nonlinear > >> >+ * file mapping, so we borrow bit 7 for soft dirty tracking. > >> >+ */ > >> >+#ifdef CONFIG_MEM_SOFT_DIRTY > >> >+#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE > >> >+#else > >> >+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) > >> >+#endif > >> >+ > >> > #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) > >> > #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) > >> > #else > >> >Index: linux-2.6.git/fs/proc/task_mmu.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/fs/proc/task_mmu.c > >> >+++ linux-2.6.git/fs/proc/task_mmu.c > >> >@@ -730,8 +730,14 @@ static inline void clear_soft_dirty(stru > >> > * of how soft-dirty works. 
> >> > */ > >> > pte_t ptent = *pte; > >> >- ptent = pte_wrprotect(ptent); > >> >- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >> >+ > >> >+ if (pte_present(ptent)) { > >> >+ ptent = pte_wrprotect(ptent); > >> >+ ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); > >> >+ } else if (is_swap_pte(ptent)) { > >> >+ ptent = pte_swp_clear_soft_dirty(ptent); > >> >+ } > >> >+ > >> > set_pte_at(vma->vm_mm, addr, pte, ptent); > >> > #endif > >> > } > >> >@@ -752,14 +758,15 @@ static int clear_refs_pte_range(pmd_t *p > >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > >> > for (; addr != end; pte++, addr += PAGE_SIZE) { > >> > ptent = *pte; > >> >- if (!pte_present(ptent)) > >> >- continue; > >> > > >> > if (cp->type == CLEAR_REFS_SOFT_DIRTY) { > >> > clear_soft_dirty(vma, addr, pte); > >> > continue; > >> > } > >> > > >> >+ if (!pte_present(ptent)) > >> >+ continue; > >> >+ > >> > page = vm_normal_page(vma, addr, ptent); > >> > if (!page) > >> > continue; > >> >@@ -930,8 +937,10 @@ static void pte_to_pagemap_entry(pagemap > >> > flags = PM_PRESENT; > >> > page = vm_normal_page(vma, addr, pte); > >> > } else if (is_swap_pte(pte)) { > >> >- swp_entry_t entry = pte_to_swp_entry(pte); > >> >- > >> >+ swp_entry_t entry; > >> >+ if (pte_swp_soft_dirty(pte)) > >> >+ flags2 |= __PM_SOFT_DIRTY; > >> >+ entry = pte_to_swp_entry(pte); > >> > frame = swp_type(entry) | > >> > (swp_offset(entry) << MAX_SWAPFILES_SHIFT); > >> > flags = PM_SWAP; > >> >Index: linux-2.6.git/include/asm-generic/pgtable.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/include/asm-generic/pgtable.h > >> >+++ linux-2.6.git/include/asm-generic/pgtable.h > >> >@@ -417,6 +417,21 @@ static inline pmd_t pmd_mksoft_dirty(pmd > >> > { > >> > return pmd; > >> > } > >> >+ > >> >+static inline pte_t pte_swp_mksoft_dirty(pte_t pte) > >> >+{ > >> >+ return pte; > >> >+} > >> >+ > >> >+static inline int pte_swp_soft_dirty(pte_t pte) > >> >+{ > >> >+ 
return 0; > >> >+} > >> >+ > >> >+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) > >> >+{ > >> >+ return pte; > >> >+} > >> > #endif > >> > > >> > #ifndef __HAVE_PFNMAP_TRACKING > >> >Index: linux-2.6.git/include/linux/swapops.h > >> >=================================================================== > >> >--- linux-2.6.git.orig/include/linux/swapops.h > >> >+++ linux-2.6.git/include/linux/swapops.h > >> >@@ -67,6 +67,8 @@ static inline swp_entry_t pte_to_swp_ent > >> > swp_entry_t arch_entry; > >> > > >> > BUG_ON(pte_file(pte)); > >> >+ if (pte_swp_soft_dirty(pte)) > >> >+ pte = pte_swp_clear_soft_dirty(pte); > >> > arch_entry = __pte_to_swp_entry(pte); > >> > return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); > >> > } > >> >Index: linux-2.6.git/mm/memory.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/mm/memory.c > >> >+++ linux-2.6.git/mm/memory.c > >> >@@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct > >> > exclusive = 1; > >> > } > >> > flush_icache_page(vma, page); > >> >+ if (pte_swp_soft_dirty(orig_pte)) > >> >+ pte = pte_mksoft_dirty(pte); > >> > >> entry = pte_to_swp_entry(orig_pte); > >> orig_pte's _PTE_SWP_SOFT_DIRTY bit has already been cleared. > > > >You seem to walk same way with me. > >Please look at my stupid questions in this thread. > > > > I see your discussion with Cyrill, however, pte_to_swp_entry and pte_swp_soft_dirty > both against orig_pte, where I miss? ;-) pte_to_swp_entry is passed orig_pte by value, not by pointer, so although pte_to_swp_entry clears out _PTE_SWP_SOFT_DIRTY, it does so on a local copy. So orig_pte is never changed. 
> > >> > >> > set_pte_at(mm, address, page_table, pte); > >> > if (page == swapcache) > >> > do_page_add_anon_rmap(page, vma, address, exclusive); > >> >Index: linux-2.6.git/mm/rmap.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/mm/rmap.c > >> >+++ linux-2.6.git/mm/rmap.c > >> >@@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, > >> > swp_entry_to_pte(make_hwpoison_entry(page))); > >> > } else if (PageAnon(page)) { > >> > swp_entry_t entry = { .val = page_private(page) }; > >> >+ pte_t swp_pte; > >> > > >> > if (PageSwapCache(page)) { > >> > /* > >> >@@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, > >> > BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); > >> > entry = make_migration_entry(page, pte_write(pteval)); > >> > } > >> >- set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); > >> >+ swp_pte = swp_entry_to_pte(entry); > >> >+ if (pte_soft_dirty(pteval)) > >> >+ swp_pte = pte_swp_mksoft_dirty(swp_pte); > >> >+ set_pte_at(mm, address, pte, swp_pte); > >> > BUG_ON(pte_file(*pte)); > >> > } else if (IS_ENABLED(CONFIG_MIGRATION) && > >> > (TTU_ACTION(flags) == TTU_MIGRATION)) { > >> >Index: linux-2.6.git/mm/swapfile.c > >> >=================================================================== > >> >--- linux-2.6.git.orig/mm/swapfile.c > >> >+++ linux-2.6.git/mm/swapfile.c > >> >@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, > >> > } > >> > #endif /* CONFIG_HIBERNATION */ > >> > > >> >+static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) > >> >+{ > >> >+#ifdef CONFIG_MEM_SOFT_DIRTY > >> >+ /* > >> >+ * When pte keeps soft dirty bit the pte generated > >> >+ * from swap entry does not has it, still it's same > >> >+ * pte from logical point of view. 
> >> >+ */ > >> >+ pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); > >> >+ return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); > >> >+#else > >> >+ return pte_same(pte, swp_pte); > >> >+#endif > >> >+} > >> >+ > >> > /* > >> > * No need to decide whether this PTE shares the swap entry with others, > >> > * just let do_wp_page work it out if a write is requested later - to > >> >@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_stru > >> > } > >> > > >> > pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); > >> >- if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { > >> >+ if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { > >> > mem_cgroup_cancel_charge_swapin(memcg); > >> > ret = 0; > >> > goto out; > >> >@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_are > >> > * swapoff spends a _lot_ of time in this loop! > >> > * Test inline before going to call unuse_pte. > >> > */ > >> >- if (unlikely(pte_same(*pte, swp_pte))) { > >> >+ if (unlikely(maybe_same_pte(*pte, swp_pte))) { > >> > pte_unmap(pte); > >> > ret = unuse_pte(vma, pmd, addr, entry, page); > >> > if (ret) > >> > > >> >-- > >> >To unsubscribe, send a message with 'unsubscribe linux-mm' in > >> >the body to majordomo@kvack.org. For more info on Linux MM, > >> >see: http://www.linux-mm.org/ . > >> >Don't email: email@kvack.org > >> > >> -- > >> To unsubscribe, send a message with 'unsubscribe linux-mm' in > >> the body to majordomo@kvack.org. For more info on Linux MM, > >> see: http://www.linux-mm.org/ . > >> Don't email: email@kvack.org > > > >-- > >Kind regards, > >Minchan Kim > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: email@kvack.org -- Kind regards, Minchan Kim From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754530Ab3HEFnk (ORCPT ); Mon, 5 Aug 2013 01:43:40 -0400 Received: from mail-lb0-f173.google.com ([209.85.217.173]:32782 "EHLO mail-lb0-f173.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754135Ab3HEFnj (ORCPT ); Mon, 5 Aug 2013 01:43:39 -0400 Date: Mon, 5 Aug 2013 09:43:35 +0400 From: Cyrill Gorcunov To: Wanpeng Li Cc: Minchan Kim , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, akpm@linux-foundation.org, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130805054335.GC7999@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <51ff047d.2768310a.2fc4.340fSMTPIN_ADDED_BROKEN@mx.google.com> <20130805021715.GJ32486@bbox> <51ff1053.ab47310a.5d3f.566cSMTPIN_ADDED_BROKEN@mx.google.com> <20130805025437.GK32486@bbox> <51ff14e9.87ef440a.1424.ffffe470SMTPIN_ADDED_BROKEN@mx.google.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51ff14e9.87ef440a.1424.ffffe470SMTPIN_ADDED_BROKEN@mx.google.com> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Mon, Aug 05, 2013 at 10:58:35AM +0800, Wanpeng Li wrote: > > > >pte_to_swp_entry is passed orig_pte by vaule, not a pointer > >so although pte_to_swp_entry clear out _PTE_SWP_SOFT_DIRTY, it does it in local-copy. > >So orig_pte is never changed. > > Ouch! Thanks for pointing out. ;-) > > Reviewed-by: Wanpeng Li Yeah, it's a bit tricky. Thanks. 
From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932699Ab3HGUV6 (ORCPT ); Wed, 7 Aug 2013 16:21:58 -0400 Received: from mail.linuxfoundation.org ([140.211.169.12]:39757 "EHLO mail.linuxfoundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756928Ab3HGUV5 (ORCPT ); Wed, 7 Aug 2013 16:21:57 -0400 Date: Wed, 7 Aug 2013 13:21:56 -0700 From: Andrew Morton To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-Id: <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> In-Reply-To: <20130730204654.844299768@gmail.com> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> X-Mailer: Sylpheed 3.2.0beta5 (GTK+ 2.24.10; x86_64-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Wed, 31 Jul 2013 00:41:55 +0400 Cyrill Gorcunov wrote: > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > bit set get swapped out, the bit is getting lost and no longer > available when pte read back. > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > saved in pte entry for the page being swapped out. When such page > is to be read back from a swap cache we check for bit presence > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > bit back. > > One of the problem was to find a place in pte entry where we can > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. 
The > _PAGE_PSE was chosen for that, it doesn't intersect with swap > entry format stored in pte. So the implication is that if another architecture wants to support this (and, realistically, wants to support CRIU), that architecture must find a spare pte bit to implement _PTE_SWP_SOFT_DIRTY. Yes? From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933110Ab3HGU2P (ORCPT ); Wed, 7 Aug 2013 16:28:15 -0400 Received: from mail.linuxfoundation.org ([140.211.169.12]:39781 "EHLO mail.linuxfoundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932343Ab3HGU2O (ORCPT ); Wed, 7 Aug 2013 16:28:14 -0400 Date: Wed, 7 Aug 2013 13:28:12 -0700 From: Andrew Morton To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-Id: <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> In-Reply-To: <20130730204654.966378702@gmail.com> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> X-Mailer: Sylpheed 3.2.0beta5 (GTK+ 2.24.10; x86_64-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Wed, 31 Jul 2013 00:41:56 +0400 Cyrill Gorcunov wrote: > +#define pte_to_pgoff(pte) \ > + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ > + & ((1U << PTE_FILE_BITS1) - 1))) \ > + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ > + & ((1U << PTE_FILE_BITS2) - 1)) \ > + << (PTE_FILE_BITS1)) \ > + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ > + & ((1U << PTE_FILE_BITS3) - 1)) \ > + << (PTE_FILE_BITS1 + 
PTE_FILE_BITS2)) \ > + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ > + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) > + > +#define pgoff_to_pte(off) \ > + ((pte_t) { .pte_low = \ > + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ > + + ((((off) >> PTE_FILE_BITS1) \ > + & ((1U << PTE_FILE_BITS2) - 1)) \ > + << PTE_FILE_SHIFT2) \ > + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ > + & ((1U << PTE_FILE_BITS3) - 1)) \ > + << PTE_FILE_SHIFT3) \ > + + ((((off) >> \ > + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ > + << PTE_FILE_SHIFT4) \ > + + _PAGE_FILE }) Good god. I wonder if these can be turned into out-of-line functions in some form which humans can understand. or #define pte_to_pgoff(pte) frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933160Ab3HGU3S (ORCPT ); Wed, 7 Aug 2013 16:29:18 -0400 Received: from mail-lb0-f178.google.com ([209.85.217.178]:65128 "EHLO mail-lb0-f178.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932350Ab3HGU3R (ORCPT ); Wed, 7 Aug 2013 16:29:17 -0400 Date: Thu, 8 Aug 2013 00:29:14 +0400 From: Cyrill Gorcunov To: Andrew Morton Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages Message-ID: <20130807202914.GO7999@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> MIME-Version: 1.0 Content-Type: text/plain; 
charset=us-ascii Content-Disposition: inline In-Reply-To: <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Wed, Aug 07, 2013 at 01:21:56PM -0700, Andrew Morton wrote: > > > > One of the problem was to find a place in pte entry where we can > > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > > _PAGE_PSE was chosen for that, it doesn't intersect with swap > > entry format stored in pte. > > So the implication is that if another architecture wants to support > this (and, realistically, wants to support CRIU), that architecture > must find a spare pte bit to implement _PTE_SWP_SOFT_DIRTY. Yes? Exactly. From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933176Ab3HGUbJ (ORCPT ); Wed, 7 Aug 2013 16:31:09 -0400 Received: from mail-la0-f52.google.com ([209.85.215.52]:51701 "EHLO mail-la0-f52.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933014Ab3HGUbH (ORCPT ); Wed, 7 Aug 2013 16:31:07 -0400 Date: Thu, 8 Aug 2013 00:31:03 +0400 From: Cyrill Gorcunov To: Andrew Morton Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130807203103.GP7999@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: 
linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: > > Good god. > > I wonder if these can be turned into out-of-line functions in some form > which humans can understand. > > or > > #define pte_to_pgoff(pte) > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) I copied this code from the existing one, it's not my invention ;) I'll clean it up on top. From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758645Ab3HJRs2 (ORCPT ); Sat, 10 Aug 2013 13:48:28 -0400 Received: from bedivere.hansenpartnership.com ([66.63.167.143]:48307 "EHLO bedivere.hansenpartnership.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752701Ab3HJRs0 (ORCPT ); Sat, 10 Aug 2013 13:48:26 -0400 Message-ID: <1376156903.2156.30.camel@dabdike.int.hansenpartnership.com> Subject: Re: [patch 1/2] [PATCH] mm: Save soft-dirty bits on swapped pages From: James Bottomley To: Andrew Morton Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, gorcunov@openvz.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com Date: Sat, 10 Aug 2013 10:48:23 -0700 In-Reply-To: <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> References: <20130730204154.407090410@gmail.com> <20130730204654.844299768@gmail.com> <20130807132156.e97bbcc3d543cf88d5a0997d@linux-foundation.org> Content-Type: text/plain; charset="ISO-8859-15" X-Mailer: Evolution 3.8.3 Mime-Version: 1.0 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Wed, 
2013-08-07 at 13:21 -0700, Andrew Morton wrote: > On Wed, 31 Jul 2013 00:41:55 +0400 Cyrill Gorcunov wrote: > > > Andy Lutomirski reported that in case if a page with _PAGE_SOFT_DIRTY > > bit set get swapped out, the bit is getting lost and no longer > > available when pte read back. > > > > To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is > > saved in pte entry for the page being swapped out. When such page > > is to be read back from a swap cache we check for bit presence > > and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY > > bit back. > > > > One of the problem was to find a place in pte entry where we can > > save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The > > _PAGE_PSE was chosen for that, it doesn't intersect with swap > > entry format stored in pte. > > So the implication is that if another architecture wants to support > this (and, realistically, wants to support CRIU), To be clear, CRIU is usable for basic checkpoint/restore without soft dirty. It's using CRIU as an engine for process migration between nodes that won't work efficiently without soft dirty. What happens without soft dirty is that we have to freeze the source process state, transfer the bits and then begin execution on the target ... that means the process can be suspended for minutes (and means that customers notice and your SLAs get blown). Using soft dirty, we can iteratively build up the process image on the target while the source process is still executing meaning the actual transfer between source and target takes only seconds (when the delta is small enough, we freeze the source, transfer the remaining changed bits and begin on the target). 
James From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755624Ab3HLV5Y (ORCPT ); Mon, 12 Aug 2013 17:57:24 -0400 Received: from mail.linuxfoundation.org ([140.211.169.12]:57251 "EHLO mail.linuxfoundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754815Ab3HLV5X (ORCPT ); Mon, 12 Aug 2013 17:57:23 -0400 Date: Mon, 12 Aug 2013 14:57:20 -0700 From: Andrew Morton To: Cyrill Gorcunov Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org, luto@amacapital.net, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. Peter Anvin" , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-Id: <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> In-Reply-To: <20130808145120.GA1775@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> X-Mailer: Sylpheed 3.2.0beta5 (GTK+ 2.24.10; x86_64-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, 8 Aug 2013 18:51:20 +0400 Cyrill Gorcunov wrote: > On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: > > > > Good god. > > > > I wonder if these can be turned into out-of-line functions in some form > > which humans can understand. > > > > or > > > > #define pte_to_pgoff(pte) > > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + > > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + > > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + > > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) > > Hi, here is what I ended up with. 
Please take a look (I decided to post > patch in the thread since it's related to the context of the mails). You could have #undefed _mfrob and __frob after using them, but whatever. I saved this patch to wave at the x86 guys for 3.12. I plan to merge mm-save-soft-dirty-bits-on-file-pages.patch for 3.11. > Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" > test present in this pgtable-2level.h file at all? I can't imagine > where it can be false on x86. I doubt if "Guys" read this. x86 maintainers cc'ed. From: Cyrill Gorcunov Subject: arch/x86/include/asm/pgtable-2level.h: clean up pte_to_pgoff and pgoff_to_pte helpers Andrew asked if there a way to make pte_to_pgoff and pgoff_to_pte macro helpers somehow more readable. With this patch it should be more understandable what is happening with bits when they come to and from pte entry. Signed-off-by: Cyrill Gorcunov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable-2level.h | 82 ++++++++++++------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff -puN arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers arch/x86/include/asm/pgtable-2level.h --- a/arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers +++ a/arch/x86/include/asm/pgtable-2level.h @@ -55,6 +55,9 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) +#define __frob(v,r,l) (((v) >> (r)) << (l)) + #ifdef CONFIG_MEM_SOFT_DIRTY /* @@ -71,31 +74,27 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ - & ((1U << 
PTE_FILE_BITS1) - 1))) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << (PTE_FILE_BITS1)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) \ - & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - & ((1U << PTE_FILE_BITS3) - 1)) \ - << PTE_FILE_SHIFT3) \ - + ((((off) >> \ - (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ - << PTE_FILE_SHIFT4) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) +#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) +#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) + +#define pte_to_pgoff(pte) \ + (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ + __frob((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_LSHIFT4)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + _mfrob(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ + __frob(off, PTE_FILE_LSHIFT4, PTE_FILE_SHIFT4) + \ + _PAGE_FILE }) #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -115,22 +114,23 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 
-#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> PTE_FILE_SHIFT1) \ - & ((1U << PTE_FILE_BITS1) - 1)) \ - + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ - & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ - + (((pte).pte_low >> PTE_FILE_SHIFT3) \ - << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) - -#define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = \ - (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ - + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ - << PTE_FILE_SHIFT2) \ - + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ - << PTE_FILE_SHIFT3) \ - + _PAGE_FILE }) +#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) +#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) + +#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) +#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) + +#define pte_to_pgoff(pte) \ + (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __frob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_LSHIFT3)) + +#define pgoff_to_pte(off) \ + ((pte_t) { .pte_low = \ + _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __frob(off, PTE_FILE_LSHIFT3, PTE_FILE_SHIFT3) + \ + _PAGE_FILE }) #endif /* CONFIG_MEM_SOFT_DIRTY */ _ From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755773Ab3HLW2i (ORCPT ); Mon, 12 Aug 2013 18:28:38 -0400 Received: from mail-ve0-f176.google.com ([209.85.128.176]:62763 "EHLO mail-ve0-f176.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752976Ab3HLW2h (ORCPT ); Mon, 12 Aug 2013 18:28:37 -0400 MIME-Version: 1.0 In-Reply-To: <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> 
<20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> From: Andy Lutomirski Date: Mon, 12 Aug 2013 15:28:06 -0700 Message-ID: Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages To: Andrew Morton Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. Peter Anvin" , Thomas Gleixner Content-Type: text/plain; charset=ISO-8859-1 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Mon, Aug 12, 2013 at 2:57 PM, Andrew Morton wrote: > On Thu, 8 Aug 2013 18:51:20 +0400 Cyrill Gorcunov wrote: > >> On Wed, Aug 07, 2013 at 01:28:12PM -0700, Andrew Morton wrote: >> > >> > Good god. >> > >> > I wonder if these can be turned into out-of-line functions in some form >> > which humans can understand. >> > >> > or >> > >> > #define pte_to_pgoff(pte) >> > frob(pte, PTE_FILE_SHIFT1, PTE_FILE_BITS1) + >> > frob(PTE_FILE_SHIFT2, PTE_FILE_BITS2) + >> > frob(PTE_FILE_SHIFT3, PTE_FILE_BITS3) + >> > frob(PTE_FILE_SHIFT4, PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) >> >> Hi, here is what I ended up with. Please take a look (I decided to post >> patch in the thread since it's related to the context of the mails). > > You could have #undefed _mfrob and __frob after using them, but whatever. > > I saved this patch to wave at the x86 guys for 3.12. I plan to merge > mm-save-soft-dirty-bits-on-file-pages.patch for 3.11. > >> Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" >> test present in this pgtable-2level.h file at all? I can't imagine >> where it can be false on x86. > > I doubt if "Guys" read this. x86 maintainers cc'ed. 
> > > > > > From: Cyrill Gorcunov > Subject: arch/x86/include/asm/pgtable-2level.h: clean up pte_to_pgoff and pgoff_to_pte helpers > > Andrew asked if there a way to make pte_to_pgoff and pgoff_to_pte macro > helpers somehow more readable. > > With this patch it should be more understandable what is happening with > bits when they come to and from pte entry. > > Signed-off-by: Cyrill Gorcunov > Cc: Ingo Molnar > Cc: "H. Peter Anvin" > Cc: Thomas Gleixner > Signed-off-by: Andrew Morton > --- > > arch/x86/include/asm/pgtable-2level.h | 82 ++++++++++++------------ > 1 file changed, 41 insertions(+), 41 deletions(-) > > diff -puN arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers arch/x86/include/asm/pgtable-2level.h > --- a/arch/x86/include/asm/pgtable-2level.h~arch-x86-include-asm-pgtable-2levelh-clean-up-pte_to_pgoff-and-pgoff_to_pte-helpers > +++ a/arch/x86/include/asm/pgtable-2level.h > @@ -55,6 +55,9 @@ static inline pmd_t native_pmdp_get_and_ > #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) > #endif > > +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) > +#define __frob(v,r,l) (((v) >> (r)) << (l)) > + > #ifdef CONFIG_MEM_SOFT_DIRTY > If I'm understanding this right, the idea is to take the bits in the range a..b of v and stick them at c..d, where a-b == c-d. Would it make sense to change this to look something like #define __frob(v, inmsb, inlsb, outlsb) ((v >> inlsb) & ((1<<(inmsb - inlsb + 1)-1) << outlsb) For extra fun, there could be an __unfrob macro that takes the same inmsg, inlsb, outlsb parameters but undoes it so that it's (more) clear that the operations that are supposed to be inverses are indeed inverses. 
--Andy From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755843Ab3HLWh2 (ORCPT ); Mon, 12 Aug 2013 18:37:28 -0400 Received: from mail.linuxfoundation.org ([140.211.169.12]:57537 "EHLO mail.linuxfoundation.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754765Ab3HLWh1 (ORCPT ); Mon, 12 Aug 2013 18:37:27 -0400 Date: Mon, 12 Aug 2013 15:37:25 -0700 From: Andrew Morton To: Andy Lutomirski Cc: Cyrill Gorcunov , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. Peter Anvin" , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-Id: <20130812153725.6ac5135a86994e4d766723f9@linux-foundation.org> In-Reply-To: References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> X-Mailer: Sylpheed 3.2.0beta5 (GTK+ 2.24.10; x86_64-pc-linux-gnu) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Mon, 12 Aug 2013 15:28:06 -0700 Andy Lutomirski wrote: > > +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) > > +#define __frob(v,r,l) (((v) >> (r)) << (l)) > > + > > #ifdef CONFIG_MEM_SOFT_DIRTY > > > > If I'm understanding this right, the idea is to take the bits in the > range a..b of v and stick them at c..d, where a-b == c-d. 
Would it > make sense to change this to look something like > > #define __frob(v, inmsb, inlsb, outlsb) ((v >> inlsb) & ((1<<(inmsb - > inlsb + 1)-1) << outlsb) > > For extra fun, there could be an __unfrob macro that takes the same > inmsg, inlsb, outlsb parameters but undoes it so that it's (more) > clear that the operations that are supposed to be inverses are indeed > inverses. hm, I seem to remember writing drivers/net/ethernet/3com/3c59x.c:BFINS() and BFEXT() shortly after the invention of the electronic computer. I'm kinda surprised that we don't already have something like this in kernel.h or somewhere - there's surely a ton of code which does such things. From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752298Ab3HMFCS (ORCPT ); Tue, 13 Aug 2013 01:02:18 -0400 Received: from mail-la0-f41.google.com ([209.85.215.41]:36445 "EHLO mail-la0-f41.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750793Ab3HMFCQ (ORCPT ); Tue, 13 Aug 2013 01:02:16 -0400 Date: Tue, 13 Aug 2013 09:02:13 +0400 From: Cyrill Gorcunov To: Andy Lutomirski Cc: Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , "H. 
Peter Anvin" , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130813050213.GA2869@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Mon, Aug 12, 2013 at 03:28:06PM -0700, Andy Lutomirski wrote: > > > > You could have #undefed _mfrob and __frob after using them, but whatever. Sure, for some reason I forgot to do that. Will send update on top. > > I saved this patch to wave at the x86 guys for 3.12. I plan to merge > > mm-save-soft-dirty-bits-on-file-pages.patch for 3.11. > > > >> Guys, is there a reason for "if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE" > >> test present in this pgtable-2level.h file at all? I can't imagine > >> where it can be false on x86. > > > > I doubt if "Guys" read this. x86 maintainers cc'ed. Thanks! > > +#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) > > +#define __frob(v,r,l) (((v) >> (r)) << (l)) > > + > > #ifdef CONFIG_MEM_SOFT_DIRTY > > If I'm understanding this right, the idea is to take the bits in the > range a..b of v and stick them at c..d, where a-b == c-d. Would it > make sense to change this to look something like > > #define __frob(v, inmsb, inlsb, outlsb) ((v >> inlsb) & ((1<<(inmsb - > inlsb + 1)-1) << outlsb) There is a case when you don't need a mask completely. And because this pte conversion is on hot path and time critical I kept generated code as it was (even if that lead to slightly less clear source code). 
> For extra fun, there could be an __unfrob macro that takes the same > inmsg, inlsb, outlsb parameters but undoes it so that it's (more) > clear that the operations that are supposed to be inverses are indeed > inverses. From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758259Ab3HMPP7 (ORCPT ); Tue, 13 Aug 2013 11:15:59 -0400 Received: from terminus.zytor.com ([198.137.202.10]:33216 "EHLO mail.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1758185Ab3HMPP6 (ORCPT ); Tue, 13 Aug 2013 11:15:58 -0400 Message-ID: <520A4D5F.6020401@zytor.com> Date: Tue, 13 Aug 2013 08:14:39 -0700 From: "H. Peter Anvin" User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130625 Thunderbird/17.0.7 MIME-Version: 1.0 To: Cyrill Gorcunov CC: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> In-Reply-To: <20130813050213.GA2869@moon> X-Enigmail-Version: 1.5.2 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On 08/12/2013 10:02 PM, Cyrill Gorcunov wrote: > > There is a case when you don't need a mask completely. And because this > pte conversion is on hot path and time critical I kept generated code > as it was (even if that lead to slightly less clear source code). 
> Does it actually matter, generated-code-wise, or is the compiler smart enough to figure it out? The reason I'm asking is because it makes the code much harder to follow. The other thing is can we please pretty please call it something other than "frob"? -hpa From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758183Ab3HMPhI (ORCPT ); Tue, 13 Aug 2013 11:37:08 -0400 Received: from mail-la0-f50.google.com ([209.85.215.50]:45238 "EHLO mail-la0-f50.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1757619Ab3HMPhG (ORCPT ); Tue, 13 Aug 2013 11:37:06 -0400 Date: Tue, 13 Aug 2013 19:37:03 +0400 From: Cyrill Gorcunov To: "H. Peter Anvin" Cc: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130813153703.GE2869@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> <520A4D5F.6020401@zytor.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <520A4D5F.6020401@zytor.com> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Tue, Aug 13, 2013 at 08:14:39AM -0700, H. Peter Anvin wrote: > On 08/12/2013 10:02 PM, Cyrill Gorcunov wrote: > > > > There is a case when you don't need a mask completely. 
And because this > > pte conversion is on hot path and time critical I kept generated code > > as it was (even if that lead to slightly less clear source code). > > > > Does it actually matter, generated-code-wise, or is the compiler smart > enough to figure it out? The reason I'm asking is because it makes the gcc-4.7.2 is smart enough to suppress useless masking (ie ((1u << 31) - 1)) completely but I don't know if this can be assumed for all gcc series. > code much harder to follow. I see. OK, I'll try to prepare more readable macro helpers. > > The other thing is can we please pretty please call it something other > than "frob"? Sure. From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1757329Ab3HMQoY (ORCPT ); Tue, 13 Aug 2013 12:44:24 -0400 Received: from terminus.zytor.com ([198.137.202.10]:34411 "EHLO mail.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756163Ab3HMQoX (ORCPT ); Tue, 13 Aug 2013 12:44:23 -0400 Message-ID: <520A622B.7020900@zytor.com> Date: Tue, 13 Aug 2013 09:43:23 -0700 From: "H. 
Peter Anvin" User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130625 Thunderbird/17.0.7 MIME-Version: 1.0 To: Cyrill Gorcunov CC: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> <520A4D5F.6020401@zytor.com> <20130813153703.GE2869@moon> In-Reply-To: <20130813153703.GE2869@moon> X-Enigmail-Version: 1.5.2 Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On 08/13/2013 08:37 AM, Cyrill Gorcunov wrote: >> >> Does it actually matter, generated-code-wise, or is the compiler smart >> enough to figure it out? The reason I'm asking is because it makes the > > gcc-4.7.2 is smart enough to suppress useless masking (ie ((1u << 31) - 1)) > completely but I don't know if this can be assumed for all gcc series. > I would be highly surprised if it wasn't the case for any gcc we care about. -hpa From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S932225Ab3HMV3D (ORCPT ); Tue, 13 Aug 2013 17:29:03 -0400 Received: from mail-lb0-f173.google.com ([209.85.217.173]:44743 "EHLO mail-lb0-f173.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932070Ab3HMV3B (ORCPT ); Tue, 13 Aug 2013 17:29:01 -0400 Date: Wed, 14 Aug 2013 01:28:57 +0400 From: Cyrill Gorcunov To: "H. 
Peter Anvin" Cc: Andy Lutomirski , Andrew Morton , linux-mm@kvack.org, linux-kernel@vger.kernel.org, xemul@parallels.com, mpm@selenic.com, xiaoguangrong@linux.vnet.ibm.com, mtosatti@redhat.com, kosaki.motohiro@gmail.com, sfr@canb.auug.org.au, peterz@infradead.org, aneesh.kumar@linux.vnet.ibm.com, Ingo Molnar , Thomas Gleixner Subject: Re: [patch 2/2] [PATCH] mm: Save soft-dirty bits on file pages Message-ID: <20130813212857.GI2869@moon> References: <20130730204154.407090410@gmail.com> <20130730204654.966378702@gmail.com> <20130807132812.60ad4bfe85127794094d385e@linux-foundation.org> <20130808145120.GA1775@moon> <20130812145720.3b722b066fe1bd77291331e5@linux-foundation.org> <20130813050213.GA2869@moon> <520A4D5F.6020401@zytor.com> <20130813153703.GE2869@moon> <520A622B.7020900@zytor.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <520A622B.7020900@zytor.com> User-Agent: Mutt/1.5.21 (2010-09-15) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Tue, Aug 13, 2013 at 09:43:23AM -0700, H. Peter Anvin wrote: > On 08/13/2013 08:37 AM, Cyrill Gorcunov wrote: > >> > >> Does it actually matter, generated-code-wise, or is the compiler smart > >> enough to figure it out? The reason I'm asking is because it makes the > > > > gcc-4.7.2 is smart enough to suppress useless masking (ie ((1u << 31) - 1)) > > completely but I don't know if this can be assumed for all gcc series. > > > > I would be highly surprised if it wasn't the case for any gcc we care about. Does the one below look better?
(Btw, what about the snippet we have there as well #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) #else #define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) #define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) #endif where #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -> 8 #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY -> 6 so I wonder in which cases on x86 _PAGE_BIT_FILE > _PAGE_BIT_PROTNONE can hold; what am I missing here?) --- arch/x86/include/asm/pgtable-2level.h | 37 +++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) Index: linux-2.6.git/arch/x86/include/asm/pgtable-2level.h =================================================================== --- linux-2.6.git.orig/arch/x86/include/asm/pgtable-2level.h +++ linux-2.6.git/arch/x86/include/asm/pgtable-2level.h @@ -55,8 +55,11 @@ static inline pmd_t native_pmdp_get_and_ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -#define _mfrob(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) -#define __frob(v,r,l) (((v) >> (r)) << (l)) +/* + * For readable bitfield manipulations.
+ */ +#define PTE_FILE_NOMASK (-1U) +#define __bfop(v,r,m,l) ((((v) >> (r)) & (m)) << (l)) #ifdef CONFIG_MEM_SOFT_DIRTY @@ -83,17 +86,17 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) #define pte_to_pgoff(pte) \ - (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ - _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ - _mfrob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ - __frob((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_LSHIFT4)) + (__bfop((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT4, PTE_FILE_NOMASK, PTE_FILE_LSHIFT4)) #define pgoff_to_pte(off) \ ((pte_t) { .pte_low = \ - _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ - _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ - _mfrob(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ - __frob(off, PTE_FILE_LSHIFT4, PTE_FILE_SHIFT4) + \ + __bfop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + __bfop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __bfop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + \ + __bfop(off, PTE_FILE_LSHIFT4, PTE_FILE_NOMASK, PTE_FILE_SHIFT4) + \ _PAGE_FILE }) #else /* CONFIG_MEM_SOFT_DIRTY */ @@ -121,19 +124,21 @@ static inline pmd_t native_pmdp_get_and_ #define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) #define pte_to_pgoff(pte) \ - (_mfrob((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ - _mfrob((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ - __frob((pte).pte_low, PTE_FILE_SHIFT3, PTE_FILE_LSHIFT3)) + (__bfop((pte).pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + \ + __bfop((pte).pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + \ + __bfop((pte).pte_low, 
PTE_FILE_SHIFT3, PTE_FILE_NOMASK, PTE_FILE_LSHIFT3)) #define pgoff_to_pte(off) \ ((pte_t) { .pte_low = \ - _mfrob(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ - _mfrob(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ - __frob(off, PTE_FILE_LSHIFT3, PTE_FILE_SHIFT3) + \ + __bfop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + \ + __bfop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + \ + __bfop(off, PTE_FILE_LSHIFT3, PTE_FILE_NOMASK, PTE_FILE_SHIFT3) + \ _PAGE_FILE }) #endif /* CONFIG_MEM_SOFT_DIRTY */ +#undef __bfop + /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)