From: Peter Zijlstra

People expressed the need to track dirty pages in shared mappings. Linus
outlined the general idea of doing that through making clean writable
pages write-protected and taking the write fault.

This patch does exactly that: it makes pages in a shared writable mapping
write-protected. On write fault the pages are marked dirty and made
writable. When the pages get synced with their backing store, the
write-protection is reinstated.

It survives a simple test and shows the dirty pages in /proc/vmstat.

Changes in -v2:

 - only wrprotect pages from dirty-capable mappings. (Nick Piggin)
 - move the write-fault handling from do_wp_page() into
   handle_pte_fault(). (Nick Piggin)
 - revert to the old install_page() interface. (Nick Piggin)
 - also clear the pte dirty bit when we make pages read-only again.
   (spotted by Rik van Riel)
 - make page_wrprotect() return the number of reprotected ptes.

Signed-off-by: Peter Zijlstra

 include/linux/mm.h   |    2 +
 include/linux/rmap.h |    6 ++++
 mm/fremap.c          |   10 ++++++-
 mm/memory.c          |   24 ++++++++++++++++--
 mm/page-writeback.c  |    9 +++++-
 mm/rmap.c            |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 110 insertions(+), 7 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-06 16:51:01.000000000 +0200
@@ -183,6 +183,8 @@ extern unsigned int kobjsize(const void
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+#define VM_SharedWritable(v) (((v)->vm_flags & (VM_SHARED|VM_WRITE)) == (VM_SHARED|VM_WRITE))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
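For the curious: the "simple test" above is, roughly, something like the
user-space sketch below. It is not part of the patch; the file name and
page count are arbitrary. It dirties every page of a shared file mapping
and reads nr_dirty from /proc/vmstat before and after.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define NPAGES 64	/* arbitrary */

static long nr_dirty(void)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "nr_dirty %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/dirty-test", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char *map;
	int i;

	if (fd < 0 || ftruncate(fd, NPAGES * pagesize) < 0)
		return 1;
	map = mmap(NULL, NPAGES * pagesize, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	printf("nr_dirty before: %ld\n", nr_dirty());
	for (i = 0; i < NPAGES; i++)	/* one write fault per page */
		map[i * pagesize] = 1;
	printf("nr_dirty after:  %ld\n", nr_dirty());
	return 0;
}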
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/fremap.c	2006-05-06 16:51:01.000000000 +0200
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -79,9 +80,14 @@ int install_page(struct mm_struct *mm, s
 	inc_mm_counter(mm, file_rss);
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			pte_val = pte_wrprotect(pte_val);
+	}
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	err = 0;
 unlock:
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-06 17:20:57.000000000 +0200
@@ -49,6 +49,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2150,6 +2151,11 @@ retry:
 	entry = mk_pte(new_page, vma->vm_page_prot);
 	if (write_access)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	else if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(new_page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			entry = pte_wrprotect(entry);
+	}
 	set_pte_at(mm, address, page_table, entry);
 	if (anon) {
 		inc_mm_counter(mm, anon_rss);
@@ -2159,6 +2165,8 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access)
+				set_page_dirty(new_page);
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2257,12 +2265,22 @@ static inline int handle_pte_fault(struc
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+		if (!pte_write(entry)) {
+			if (!VM_SharedWritable(vma)) {
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			} else {
+				struct page *page;
+				entry = pte_mkwrite(entry);
+				page = vm_normal_page(vma, address, entry);
+				if (page)
+					set_page_dirty(page);
+			}
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
+	if (!pte_same(old_entry, entry)) {
 	ptep_set_access_flags(vma, address, pte, entry, write_access);
 	update_mmu_cache(vma, address, entry);
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2006-05-06 16:51:01.000000000 +0200
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -725,8 +726,10 @@ int test_clear_page_dirty(struct page *p
 					page_index(page),
 					PAGECACHE_TAG_DIRTY);
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
-		if (mapping_cap_account_dirty(mapping))
+		if (mapping_cap_account_dirty(mapping)) {
+			page_wrprotect(page);
 			dec_page_state(nr_dirty);
+		}
 		return 1;
 	}
 	write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -756,8 +759,10 @@ int clear_page_dirty_for_io(struct page
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		return 0;
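Again not part of the patch, but the handle_pte_fault() change above may
be easier to see as a user-space analogue: mprotect() plays the role of
pte_wrprotect(), SIGSEGV plays the write fault, and a flag stands in for
the pte dirty bit. Everything below is made up for illustration.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *region;
static long pagesize;
static volatile int page_dirty;		/* our one-page "nr_dirty" */

static void wp_fault(int sig, siginfo_t *si, void *uc)
{
	char *page = (char *)((unsigned long)si->si_addr & ~(pagesize - 1));

	(void)sig; (void)uc;
	/* like the handle_pte_fault() path: mark dirty, make writable */
	page_dirty = 1;
	mprotect(page, pagesize, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa;

	pagesize = sysconf(_SC_PAGESIZE);

	memset(&sa, 0, sizeof(sa));
	sigemptyset(&sa.sa_mask);
	sa.sa_sigaction = wp_fault;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	region = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return 1;

	/* start out clean and write-protected, like install_page() */
	mprotect(region, pagesize, PROT_READ);

	region[0] = 1;		/* "write fault": dirtied, made writable */
	printf("dirty after first store:  %d\n", page_dirty);

	/* "writeback": clean the page and re-protect, like
	 * clear_page_dirty_for_io() + page_wrprotect() */
	page_dirty = 0;
	mprotect(region, pagesize, PROT_READ);

	region[0] = 2;		/* faults and gets dirtied again */
	printf("dirty after second store: %d\n", page_dirty);
	return 0;
}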
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/rmap.c	2006-05-06 16:51:01.000000000 +0200
@@ -478,6 +478,72 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_write(*pte))
+		goto unlock;
+
+	entry = pte_mkclean(pte_wrprotect(*pte));
+	ptep_establish(vma, address, pte, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_wrprotect_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (VM_SharedWritable(vma))
+			ret += page_wrprotect_one(page, vma);
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_wrprotect(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page) && page->mapping) {
+		if (!PageAnon(page))
+			ret = page_wrprotect_file(page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/include/linux/rmap.h	2006-05-06 16:51:01.000000000 +0200
@@ -105,6 +105,12 @@ pte_t *page_check_address(struct page *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Used to writeprotect clean pages, in order to count nr_dirty for shared
+ * mappings
+ */
+int page_wrprotect(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
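One more illustrative sketch, also not part of the patch: because
clear_page_dirty_for_io() now calls page_wrprotect(), writeback puts the
pages back into the clean, write-protected state, so the accounting keeps
working across repeated write/sync cycles. nr_dirty() is the same
/proc/vmstat reader as in the first sketch; the file name is arbitrary and
a kernel with this patch is assumed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static long nr_dirty(void)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "nr_dirty %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/msync-cycle", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char *map;

	if (fd < 0 || ftruncate(fd, pagesize) < 0)
		return 1;
	map = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	map[0] = 1;			/* write fault: page dirtied */
	printf("after store: nr_dirty=%ld\n", nr_dirty());
	msync(map, pagesize, MS_SYNC);	/* writeback re-write-protects */
	printf("after msync: nr_dirty=%ld\n", nr_dirty());
	map[0] = 2;			/* faults, gets dirtied again */
	printf("after store: nr_dirty=%ld\n", nr_dirty());
	return 0;
}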