linux-mm.kvack.org archive mirror
* [RFC][PATCH] tracking dirty pages in shared mappings
@ 2006-05-05 20:35 Peter Zijlstra
  2006-05-06 13:18 ` Nick Piggin
  0 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-05 20:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, Rohit Seth, Andrew Morton, clameter, mbligh, hugh,
	riel, andrea, piggin, arjan, apw, mel, marcelo, anton, paulmck,
	linux-mm

People expressed the need to track dirty pages in shared mappings.
Linus outlined the general idea of doing that through making clean
writable pages write-protected and taking the write fault.

This patch does exactly that: it makes pages in a shared writable
mapping write-protected. On a write fault the pages are marked dirty
and made writable. When the pages get synced with their backing store,
the write-protection is reinstated.

It survives a simple test and shows the dirty pages in /proc/vmstat.
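
For illustration, here is a minimal sketch of that kind of test (the
actual testcase was not posted; the file name and mapping size below
are made up):

/* Dirty a shared file mapping and watch nr_dirty in /proc/vmstat: it
 * should rise as the write faults mark pages dirty, and fall again
 * once msync() writes the pages back and re-protects them. */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN	(16UL << 20)		/* 16MB, arbitrary */

int main(void)
{
	int fd = open("/tmp/dirty-test", O_RDWR | O_CREAT | O_TRUNC, 0644);
	char *p;

	ftruncate(fd, LEN);
	p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	memset(p, 0xaa, LEN);			/* take the write faults */
	system("grep nr_dirty /proc/vmstat");	/* should show the pages */
	msync(p, LEN, MS_SYNC);			/* writeback re-protects */
	system("grep nr_dirty /proc/vmstat");	/* should have dropped */
	return 0;
}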

Comments?

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 include/linux/mm.h   |    5 +++-
 include/linux/rmap.h |    6 +++++
 mm/filemap.c         |    3 +-
 mm/fremap.c          |    9 +++++--
 mm/memory.c          |   16 +++++++++++++
 mm/page-writeback.c  |    2 +
 mm/rmap.c            |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/shmem.c           |    2 -
 8 files changed, 98 insertions(+), 6 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-04 21:34:06.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-05 19:07:58.000000000 +0200
@@ -183,6 +183,9 @@ extern unsigned int kobjsize(const void 
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+#define VM_SharedWritable(v)		(((v)->vm_flags & (VM_SHARED | VM_MAYSHARE)) && \
+					 ((v)->vm_flags & VM_WRITE))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
@@ -721,7 +724,7 @@ static inline void unmap_shared_mapping_
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
-extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
+extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, int wrprotect);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 
 #ifdef CONFIG_MMU
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2006-05-04 21:34:06.000000000 +0200
+++ linux-2.6/mm/filemap.c	2006-05-05 19:10:37.000000000 +0200
@@ -1627,7 +1627,8 @@ repeat:
 		return -ENOMEM;
 
 	if (page) {
-		err = install_page(mm, vma, addr, page, prot);
+		err = install_page(mm, vma, addr, page, prot,
+				VM_SharedWritable(vma));
 		if (err) {
 			page_cache_release(page);
 			return err;
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2006-05-04 21:34:06.000000000 +0200
+++ linux-2.6/mm/fremap.c	2006-05-04 22:33:49.000000000 +0200
@@ -49,7 +49,8 @@ static int zap_pte(struct mm_struct *mm,
  * previously existing mapping.
  */
 int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, struct page *page, pgprot_t prot)
+		unsigned long addr, struct page *page, pgprot_t prot,
+		int wrprotect)
 {
 	struct inode *inode;
 	pgoff_t size;
@@ -79,9 +80,11 @@ int install_page(struct mm_struct *mm, s
 		inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	if (wrprotect)
+		pte_val = pte_wrprotect(pte_val);
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	err = 0;
 unlock:
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-04 21:34:06.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-05 19:37:14.000000000 +0200
@@ -1495,6 +1495,18 @@ static int do_wp_page(struct mm_struct *
 		}
 	}
 
+	if (VM_SharedWritable(vma)) {
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = pte_mkyoung(orig_pte);
+		entry = pte_mkwrite(pte_mkdirty(entry));
+		ptep_set_access_flags(vma, address, page_table, entry, 1);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		ret |= VM_FAULT_WRITE;
+		set_page_dirty(old_page);
+		goto unlock;
+	}
+
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
@@ -2150,6 +2162,8 @@ retry:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		else if (VM_SharedWritable(vma))
+			entry = pte_wrprotect(entry);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
@@ -2159,6 +2173,8 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access)
+				set_page_dirty(new_page);
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2006-05-04 21:34:06.000000000 +0200
+++ linux-2.6/mm/shmem.c	2006-05-04 22:12:54.000000000 +0200
@@ -1270,7 +1270,7 @@ static int shmem_populate(struct vm_area
 		/* Page may still be null, but only if nonblock was set. */
 		if (page) {
 			mark_page_accessed(page);
-			err = install_page(mm, vma, addr, page, prot);
+			err = install_page(mm, vma, addr, page, prot, 0);
 			if (err) {
 				page_cache_release(page);
 				return err;
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2006-05-04 16:30:46.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2006-05-05 13:40:08.000000000 +0200
@@ -725,6 +725,7 @@ int test_clear_page_dirty(struct page *p
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
+			page_wrprotect(page);
 			if (mapping_cap_account_dirty(mapping))
 				dec_page_state(nr_dirty);
 			return 1;
@@ -756,6 +757,7 @@ int clear_page_dirty_for_io(struct page 
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
+			page_wrprotect(page);
 			if (mapping_cap_account_dirty(mapping))
 				dec_page_state(nr_dirty);
 			return 1;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2006-05-04 19:27:42.000000000 +0200
+++ linux-2.6/mm/rmap.c	2006-05-05 22:00:34.000000000 +0200
@@ -478,6 +478,67 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_write(*pte))
+		goto unlock;
+
+	entry = pte_wrprotect(*pte);
+	ptep_establish(vma, address, pte, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return 0;
+}
+
+static int page_wrprotect_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (VM_SharedWritable(vma))
+			page_wrprotect_one(page, vma);
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return 0;
+}
+
+int page_wrprotect(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page) && page->mapping) {
+		if (!PageAnon(page))
+			page_wrprotect_file(page);
+	}
+
+	return 0;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h	2006-05-05 14:02:45.000000000 +0200
+++ linux-2.6/include/linux/rmap.h	2006-05-05 14:03:04.000000000 +0200
@@ -105,6 +105,12 @@ pte_t *page_check_address(struct page *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Used to writeprotect clean pages, in order to count nr_dirty for shared
+ * mappings
+ */
+int page_wrprotect(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)



* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-05 20:35 [RFC][PATCH] tracking dirty pages in shared mappings Peter Zijlstra
@ 2006-05-06 13:18 ` Nick Piggin
  2006-05-06 13:34   ` Peter Zijlstra
  0 siblings, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-06 13:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, clameter,
	mbligh, hugh, riel, andrea, arjan, apw, mel, marcelo, anton,
	paulmck, linux-mm

[-- Attachment #1: Type: text/plain, Size: 994 bytes --]

Peter Zijlstra wrote:

>People expressed the need to track dirty pages in shared mappings.
>Linus outlined the general idea of doing that through making clean
>writable pages write-protected and taking the write fault.
>
>This patch does exactly that: it makes pages in a shared writable
>mapping write-protected. On a write fault the pages are marked dirty
>and made writable. When the pages get synced with their backing store,
>the write-protection is reinstated.
>
>It survives a simple test and shows the dirty pages in /proc/vmstat.
>
>Comments?
>

Looks pretty good. Christoph and I were looking at ways to improve the
performance impact of this, and skipping the extra work for particular
(e.g. shmem) mappings might be a good idea?

Attached is a patch with a couple of things I've currently got.

In the long run, I'd like to be able to set_page_dirty and
balance_dirty_pages outside of both ptl and mmap_sem, for performance
reasons. That will require a reworking of arch code though :(



[-- Attachment #2: mm-track-dirty-mmap-fixes.patch --]
[-- Type: text/plain, Size: 6983 bytes --]

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-06 23:05:10.000000000 +1000
+++ linux-2.6/mm/memory.c	2006-05-06 23:13:16.000000000 +1000
@@ -48,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1466,18 +1467,6 @@ static int do_wp_page(struct mm_struct *
 		}
 	}
 
-	if (VM_SharedWritable(vma)) {
-		flush_cache_page(vma, address, pte_pfn(orig_pte));
-		entry = pte_mkyoung(orig_pte);
-		entry = pte_mkwrite(pte_mkdirty(entry));
-		ptep_set_access_flags(vma, address, page_table, entry, 1);
-		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
-		ret |= VM_FAULT_WRITE;
-		set_page_dirty(old_page);
-		goto unlock;
-	}
-
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
@@ -2131,8 +2120,11 @@ retry:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		else if (VM_SharedWritable(vma))
-			entry = pte_wrprotect(entry);
+		else if (VM_SharedWritable(vma)) {
+			struct address_space *mapping = page_mapping(new_page);
+			if (mapping && mapping_cap_account_dirty(mapping))
+				entry = pte_wrprotect(entry);
+		}
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
@@ -2241,12 +2233,22 @@ static inline int handle_pte_fault(struc
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+		if (!pte_write(entry)) {
+			if (!VM_SharedWritable(vma)) {
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			} else {
+				struct page *page;
+				entry = pte_mkwrite(entry);
+				page = vm_normal_page(vma, address, entry);
+				if (page)
+					set_page_dirty(page);
+			}
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
+
 	if (!pte_same(old_entry, entry)) {
 		ptep_set_access_flags(vma, address, pte, entry, write_access);
 		update_mmu_cache(vma, address, entry);
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-06 23:05:10.000000000 +1000
+++ linux-2.6/include/linux/mm.h	2006-05-06 23:06:17.000000000 +1000
@@ -183,8 +183,7 @@ extern unsigned int kobjsize(const void 
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
-#define VM_SharedWritable(v)		(((v)->vm_flags & (VM_SHARED | VM_MAYSHARE)) && \
-					 ((v)->vm_flags & VM_WRITE))
+#define VM_SharedWritable(v)		((v)->vm_flags & (VM_SHARED|VM_WRITE))
 
 /*
  * mapping from the currently active vm_flags protection bits (the
@@ -724,7 +723,7 @@ static inline void unmap_shared_mapping_
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
-extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, int wrprotect);
+extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 
 #ifdef CONFIG_MMU
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c	2006-05-06 23:05:10.000000000 +1000
+++ linux-2.6/mm/filemap.c	2006-05-06 23:06:17.000000000 +1000
@@ -1582,8 +1582,7 @@ repeat:
 		return -ENOMEM;
 
 	if (page) {
-		err = install_page(mm, vma, addr, page, prot,
-				VM_SharedWritable(vma));
+		err = install_page(mm, vma, addr, page, prot);
 		if (err) {
 			page_cache_release(page);
 			return err;
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2006-05-06 23:05:10.000000000 +1000
+++ linux-2.6/mm/fremap.c	2006-05-06 23:07:17.000000000 +1000
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -49,8 +50,7 @@ static int zap_pte(struct mm_struct *mm,
  * previously existing mapping.
  */
 int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, struct page *page, pgprot_t prot,
-		int wrprotect)
+		unsigned long addr, struct page *page, pgprot_t prot)
 {
 	struct inode *inode;
 	pgoff_t size;
@@ -81,8 +81,11 @@ int install_page(struct mm_struct *mm, s
 
 	flush_icache_page(vma, page);
 	pte_val = mk_pte(page, prot);
-	if (wrprotect)
-		pte_val = pte_wrprotect(pte_val);
+	if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			pte_val = pte_wrprotect(pte_val);
+	}
 	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
 	update_mmu_cache(vma, addr, pte_val);
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c	2006-05-06 23:05:10.000000000 +1000
+++ linux-2.6/mm/shmem.c	2006-05-06 23:06:17.000000000 +1000
@@ -1270,7 +1270,7 @@ static int shmem_populate(struct vm_area
 		/* Page may still be null, but only if nonblock was set. */
 		if (page) {
 			mark_page_accessed(page);
-			err = install_page(mm, vma, addr, page, prot, 0);
+			err = install_page(mm, vma, addr, page, prot);
 			if (err) {
 				page_cache_release(page);
 				return err;
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2006-05-06 23:05:10.000000000 +1000
+++ linux-2.6/mm/page-writeback.c	2006-05-06 23:06:28.000000000 +1000
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -725,9 +726,10 @@ int test_clear_page_dirty(struct page *p
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			page_wrprotect(page);
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -757,9 +759,10 @@ int clear_page_dirty_for_io(struct page 
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			page_wrprotect(page);
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		return 0;


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-06 13:18 ` Nick Piggin
@ 2006-05-06 13:34   ` Peter Zijlstra
  2006-05-06 13:47     ` Nick Piggin
  0 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-06 13:34 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, clameter,
	mbligh, hugh, riel, andrea, arjan, apw, mel, marcelo, anton,
	paulmck, linux-mm

On Sat, 2006-05-06 at 23:18 +1000, Nick Piggin wrote:

> 
> Looks pretty good. Christoph and I were looking at ways to improve the
> performance impact of this, and skipping the extra work for particular
> (e.g. shmem) mappings might be a good idea?
> 
> Attached is a patch with a couple of things I've currently got.

will merge with mine and post a new version shortly.

> In the long run, I'd like to be able to set_page_dirty and
> balance_dirty_pages outside of both ptl and mmap_sem, for performance
> reasons. That will require a reworking of arch code though :(

That would indeed be very nice if possible.

> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h	2006-05-06 23:05:10.000000000 +1000
> +++ linux-2.6/include/linux/mm.h	2006-05-06 23:06:17.000000000 +1000
> @@ -183,8 +183,7 @@ extern unsigned int kobjsize(const void 
>  #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
>  #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
>  
> -#define VM_SharedWritable(v)		(((v)->vm_flags & (VM_SHARED | VM_MAYSHARE)) && \
> -					 ((v)->vm_flags & VM_WRITE))
> +#define VM_SharedWritable(v)		((v)->vm_flags & (VM_SHARED|VM_WRITE))
>  
>  /*
>   * mapping from the currently active vm_flags protection bits (the

That doesn't look right: your version is true even for unwritable
shared VMAs and for writable non-shared VMAs.
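
To spell that out, a hypothetical userspace illustration (the flag
values are the 2.6 ones from mm.h; this snippet is not part of the
original mail):

#include <stdio.h>

#define VM_WRITE	0x00000002UL
#define VM_SHARED	0x00000008UL

int main(void)
{
	unsigned long vm_flags = VM_WRITE;	/* private writable VMA */

	/* plain mask test: nonzero, i.e. true, despite !VM_SHARED */
	printf("%lx\n", vm_flags & (VM_SHARED | VM_WRITE));

	/* intended test: demand both bits */
	printf("%d\n", (vm_flags & (VM_SHARED | VM_WRITE)) ==
			(VM_SHARED | VM_WRITE));
	return 0;
}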

PeterZ


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-06 13:34   ` Peter Zijlstra
@ 2006-05-06 13:47     ` Nick Piggin
  2006-05-06 15:29       ` Peter Zijlstra
  0 siblings, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-06 13:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, clameter,
	mbligh, hugh, riel, andrea, arjan, apw, mel, marcelo, anton,
	paulmck, linux-mm

Peter Zijlstra wrote:

>On Sat, 2006-05-06 at 23:18 +1000, Nick Piggin wrote:
>
>
>>Looks pretty good. Christoph and I were looking at ways to improve the
>>performance impact of this, and skipping the extra work for particular
>>(e.g. shmem) mappings might be a good idea?
>>
>>Attached is a patch with a couple of things I've currently got.
>>
>
>will merge with mine and post a new version shortly.
>

Thanks.

>>In the long run, I'd like to be able to set_page_dirty and
>>balance_dirty_pages outside of both ptl and mmap_sem, for performance
>>reasons. That will require a reworking of arch code though :(
>>
>
>That would indeed be very nice if possible.
>

Yep. Let's not distract from getting the basic mechanism working though.
balance_dirty_pages would be patch 2..n ;)

>>Index: linux-2.6/include/linux/mm.h
>>===================================================================
>>--- linux-2.6.orig/include/linux/mm.h	2006-05-06 23:05:10.000000000 +1000
>>+++ linux-2.6/include/linux/mm.h	2006-05-06 23:06:17.000000000 +1000
>>@@ -183,8 +183,7 @@ extern unsigned int kobjsize(const void 
>> #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
>> #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
>> 
>>-#define VM_SharedWritable(v)		(((v)->vm_flags & (VM_SHARED | VM_MAYSHARE)) && \
>>-					 ((v)->vm_flags & VM_WRITE))
>>+#define VM_SharedWritable(v)		((v)->vm_flags & (VM_SHARED|VM_WRITE))
>> 
>> /*
>>  * mapping from the currently active vm_flags protection bits (the
>>
>
>That doesn't look right, your version is true even for unwritable
>shared, and writable non-shared VMAs.
>

Of course, thanks. I guess that should be

#define VM_SharedWritable(v) (((v)->vm_flags & (VM_SHARED|VM_WRITE)) == (VM_SHARED|VM_WRITE))

BTW. It is unconventional (outside the read hints stuff) to use macros like
this. I guess real VM hackers have to know what is intended by any given esoteric
combination of flags in any given context.

Not that I hate it.

But if we're going to start using it, we should work out a sane convention and
stick to it. "StudlyCaps" seem to be out of favour, and using a vma_ prefix would
be more sensible.


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-06 13:47     ` Nick Piggin
@ 2006-05-06 15:29       ` Peter Zijlstra
  2006-05-07  0:40         ` Nick Piggin
  2006-05-08  6:43         ` Christoph Lameter
  0 siblings, 2 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-06 15:29 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, clameter,
	mbligh, hugh, riel, andrea, arjan, apw, mel, marcelo, anton,
	paulmck, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1087 bytes --]

On Sat, 2006-05-06 at 23:47 +1000, Nick Piggin wrote:

> Yep. Let's not distract from getting the basic mechanism working though.
> balance_dirty_pages would be patch 2..n ;)

Attached are both a new version of the shared_mapping_dirty patch, and
balance_dirty_pages; to be applied in that order. 

It makes my testcase survive and not OOM like it used to.

> BTW. It is unconventional (outside the read hints stuff) to use macros like
> this. I guess real VM hackers have to know what is intended by any given esoteric
> combination of flags in any given context.
> 
> Not that I hate it.
> 
> But if we're going to start using it, we should work out a sane convention and
> stick to it. "StudlyCaps" seem to be out of favour, and using a vma_ prefix would
> be more sensible.

Not a real fan of "StudlyCaps" myself either, just adapting to whatever
was there.

This macro was born because I find writing the same thing more than
twice a nuisance and error-prone. However, if people feel otherwise I'm
fine with either writing it out explicitly or renaming the thing;
suggestions?

PeterZ



[-- Attachment #2: balance_dirty_pages.patch --]
[-- Type: text/x-patch, Size: 2007 bytes --]

 mm/memory.c |   18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-06 16:51:01.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-06 17:15:16.000000000 +0200
@@ -50,6 +50,7 @@
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
 #include <linux/backing-dev.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2078,6 +2079,7 @@ static int do_no_page(struct mm_struct *
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	int dirty = 0;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2165,8 +2167,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
-			if (write_access)
+			if (write_access) {
 				set_page_dirty(new_page);
+				dirty++;
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2179,6 +2183,8 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty)
+		balance_dirty_pages_ratelimited_nr(mapping, dirty);
 	return ret;
 oom:
 	page_cache_release(new_page);
@@ -2243,6 +2249,8 @@ static inline int handle_pte_fault(struc
 	pte_t entry;
 	pte_t old_entry;
 	spinlock_t *ptl;
+	struct address_space *mapping;
+	int dirty = 0;
 
 	old_entry = entry = *pte;
 	if (!pte_present(entry)) {
@@ -2273,8 +2281,11 @@ static inline int handle_pte_fault(struc
 				struct page *page;
 				entry = pte_mkwrite(entry);
 				page = vm_normal_page(vma, address, entry);
-				if (page)
+				if (page) {
 					set_page_dirty(page);
+					mapping = page_mapping(page);
+					dirty++;
+				}
 			}
 		}
 		entry = pte_mkdirty(entry);
@@ -2297,6 +2308,9 @@ static inline int handle_pte_fault(struc
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
+	if (dirty && mapping)
+		balance_dirty_pages_ratelimited_nr(mapping, dirty);
+
 	return VM_FAULT_MINOR;
 }
 

[-- Attachment #3: shared_mapping_dirty.patch --]
[-- Type: text/x-patch, Size: 8322 bytes --]


From: Peter Zijlstra <a.p.zijlstra@chello.nl>

People expressed the need to track dirty pages in shared mappings.

Linus outlined the general idea of doing that through making clean
writable pages write-protected and taking the write fault.

This patch does exactly that: it makes pages in a shared writable
mapping write-protected. On a write fault the pages are marked dirty
and made writable. When the pages get synced with their backing store,
the write-protection is reinstated.

It survives a simple test and shows the dirty pages in /proc/vmstat.

Changes in -v2:

 - only wrprotect pages from dirty capable mappings. (Nick Piggin)
 - move the writefault handling from do_wp_page() into handle_pte_fault(). 
   (Nick Piggin)
 - revert to the old install_page interface. (Nick Piggin)
 - also clear the pte dirty bit when we make pages read-only again.
   (spotted by Rik van Riel)
 - make page_wrprotect() return the number of reprotected ptes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

 include/linux/mm.h   |    2 +
 include/linux/rmap.h |    6 ++++
 mm/fremap.c          |   10 ++++++-
 mm/memory.c          |   24 ++++++++++++++++--
 mm/page-writeback.c  |    9 +++++-
 mm/rmap.c            |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 110 insertions(+), 7 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-06 16:51:01.000000000 +0200
@@ -183,6 +183,8 @@ extern unsigned int kobjsize(const void 
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+#define VM_SharedWritable(v) (((v)->vm_flags & (VM_SHARED|VM_WRITE)) == (VM_SHARED|VM_WRITE))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/fremap.c	2006-05-06 16:51:01.000000000 +0200
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -79,9 +80,14 @@ int install_page(struct mm_struct *mm, s
 		inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			pte_val = pte_wrprotect(pte_val);
+	}
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	err = 0;
 unlock:
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-06 17:20:57.000000000 +0200
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2150,6 +2151,11 @@ retry:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		else if (VM_SharedWritable(vma)) {
+			struct address_space *mapping = page_mapping(new_page);
+			if (mapping && mapping_cap_account_dirty(mapping))
+				entry = pte_wrprotect(entry);
+		}
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
@@ -2159,6 +2165,8 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access)
+				set_page_dirty(new_page);
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2257,12 +2265,22 @@ static inline int handle_pte_fault(struc
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+		if (!pte_write(entry)) {
+			if (!VM_SharedWritable(vma)) {
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			} else {
+				struct page *page;
+				entry = pte_mkwrite(entry);
+				page = vm_normal_page(vma, address, entry);
+				if (page)
+					set_page_dirty(page);
+			}
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
+
 	if (!pte_same(old_entry, entry)) {
 		ptep_set_access_flags(vma, address, pte, entry, write_access);
 		update_mmu_cache(vma, address, entry);
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2006-05-06 16:51:01.000000000 +0200
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -725,8 +726,10 @@ int test_clear_page_dirty(struct page *p
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -756,8 +759,10 @@ int clear_page_dirty_for_io(struct page 
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		return 0;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/rmap.c	2006-05-06 16:51:01.000000000 +0200
@@ -478,6 +478,72 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_write(*pte))
+		goto unlock;
+
+	entry = pte_mkclean(pte_wrprotect(*pte));
+	ptep_establish(vma, address, pte, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_wrprotect_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (VM_SharedWritable(vma))
+			ret += page_wrprotect_one(page, vma);
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_wrprotect(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page) && page->mapping) {
+		if (!PageAnon(page))
+			ret = page_wrprotect_file(page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/include/linux/rmap.h	2006-05-06 16:51:01.000000000 +0200
@@ -105,6 +105,12 @@ pte_t *page_check_address(struct page *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Used to writeprotect clean pages, in order to count nr_dirty for shared
+ * mappings
+ */
+int page_wrprotect(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-06 15:29       ` Peter Zijlstra
@ 2006-05-07  0:40         ` Nick Piggin
  2006-05-07  3:43           ` Nick Piggin
  2006-05-08  6:43         ` Christoph Lameter
  1 sibling, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-07  0:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, clameter,
	mbligh, hugh, riel, andrea, arjan, apw, mel, marcelo, anton,
	paulmck, linux-mm

Peter Zijlstra wrote:

>On Sat, 2006-05-06 at 23:47 +1000, Nick Piggin wrote:
>
>
>>Yep. Let's not distract from getting the basic mechanism working though.
>>balance_dirty_pages would be patch 2..n ;)
>>
>
>Attached are both a new version of the shared_mapping_dirty patch, and
>balance_dirty_pages; to be applied in that order. 
>
>It makes my testcase survive and not OOM like it used to.
>

Looks OK. I wonder if test_clear_page_dirty could skip the page_wrprotect
entirely? It would speed up cases like truncate that don't care. OTOH, it
looks like several filesystems do not use clear_page_dirty_for_io where
they possibly should...

Perhaps you could consolidate both checks into test_set_page_writeback()?

Andrew, any ideas?


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-07  0:40         ` Nick Piggin
@ 2006-05-07  3:43           ` Nick Piggin
  0 siblings, 0 replies; 43+ messages in thread
From: Nick Piggin @ 2006-05-07  3:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, clameter,
	mbligh, hugh, riel, andrea, arjan, apw, mel, marcelo, anton,
	paulmck, linux-mm

Nick Piggin wrote:
> Peter Zijlstra wrote:
> 
>> On Sat, 2006-05-06 at 23:47 +1000, Nick Piggin wrote:
>>
>>
>>> Yep. Let's not distract from getting the basic mechanism working though.
>>> balance_dirty_pages would be patch 2..n ;)
>>>
>>
>> Attached are both a new version of the shared_mapping_dirty patch, and
>> balance_dirty_pages; to be applied in that order.
>> It makes my testcase survive and not OOM like it used to.
>>
> 
> Looks OK. I wonder if test_clear_page_dirty could skip the page_wrprotect
> entirely? It would speed up cases like truncate that don't care. OTOH, it
> looks like several filesystems do not use clear_page_dirty_for_io where
> they possibly should be...
> 
> Perhaps you could consolidate both checks into test_set_page_writeback()?

Actually, you'll skip the slow path of taking the lock most of the
time in situations like truncate because it will bail out after the
page_mapped check. So doing anything further is probably a premature
optimisation at this stage.

Nick


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-06 15:29       ` Peter Zijlstra
  2006-05-07  0:40         ` Nick Piggin
@ 2006-05-08  6:43         ` Christoph Lameter
  2006-05-08  7:23           ` Peter Zijlstra
                             ` (2 more replies)
  1 sibling, 3 replies; 43+ messages in thread
From: Christoph Lameter @ 2006-05-08  6:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

On Sat, 6 May 2006, Peter Zijlstra wrote:

> Attached are both a new version of the shared_mapping_dirty patch, and
> balance_dirty_pages; to be applied in that order. 

Would you please send patches inline? It seems that you need to initialize 
mapping to NULL in handle_pte_fault.

You could defer the page dirtying by taking a reference on the page,
then dropping the pte lock, dirtying the page, and then dropping the
reference. That way the dirtying runs without the pte lock.
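
A sketch of that ordering, using only helpers that already appear in
this thread (the function and its shape are hypothetical, not the
actual resubmission):

static int fault_dirty(struct vm_area_struct *vma, unsigned long address,
		       pte_t *pte, spinlock_t *ptl, pte_t entry)
{
	struct page *dirtied_page = NULL;

	/* with the pte lock held: pin the page, but don't dirty it yet */
	if (pte_write(entry)) {
		dirtied_page = vm_normal_page(vma, address, entry);
		if (dirtied_page)
			get_page(dirtied_page);
	}
	pte_unmap_unlock(pte, ptl);

	/* the dirtying now runs without the pte lock */
	if (dirtied_page) {
		set_page_dirty(dirtied_page);
		put_page(dirtied_page);
	}
	return VM_FAULT_MINOR;
}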


* Re: [RFC][PATCH] tracking dirty pages in shared mappings
  2006-05-08  6:43         ` Christoph Lameter
@ 2006-05-08  7:23           ` Peter Zijlstra
  2006-05-08 19:20           ` [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3 Peter Zijlstra
  2006-05-08 19:24           ` [RFC][PATCH 2/2] throttle writers of shared mappings Peter Zijlstra
  2 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-08  7:23 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

On Sun, 2006-05-07 at 23:43 -0700, Christoph Lameter wrote:
> On Sat, 6 May 2006, Peter Zijlstra wrote:
> 
> > Attached are both a new version of the shared_mapping_dirty patch, and
> > balance_dirty_pages; to be applied in that order. 
> 
> Would you please sent patches inline? 

Of course, will do next round; in the meantime you can get the patches
from:
http://programming.kicks-ass.net/kernel-patches/shared-dirty/

> It seems that you need to initialize 
> mapping to NULL in handle_pte_fault.

Almost, the dirty count avoids that. I should just init mapping to NULL
and drop the dirty count; there is no way that can ever be more than 1
anyway.

> You could defer the page dirtying by taking a reference on the page then 
> dropping the pte lock dirty the page and then drop the reference. That way 
> dirtying is running without the pte lock.

Right, very good, will do that when I get home.


* [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3
  2006-05-08  6:43         ` Christoph Lameter
  2006-05-08  7:23           ` Peter Zijlstra
@ 2006-05-08 19:20           ` Peter Zijlstra
  2006-05-09  5:41             ` Christoph Lameter
  2006-05-08 19:24           ` [RFC][PATCH 2/2] throttle writers of shared mappings Peter Zijlstra
  2 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-08 19:20 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

People expressed the need to track dirty pages in shared mappings.

Linus outlined the general idea of doing that through making clean
writable pages write-protected and taking the write fault.

This patch does exactly that: it makes pages in a shared writable
mapping write-protected. On a write fault the pages are marked dirty
and made writable. When the pages get synced with their backing store,
the write-protection is reinstated.

It survives a simple test and shows the dirty pages in /proc/vmstat.

Changes in -v3:

 - move set_page_dirty() outside pte lock (suggested by Christoph Lameter)

Changes in -v2:

 - only wrprotect pages from dirty capable mappings. (Nick Piggin)
 - move the writefault handling from do_wp_page() into handle_pte_fault(). 
   (Nick Piggin)
 - revert to the old install_page interface. (Nick Piggin)
 - also clear the pte dirty bit when we make pages read-only again.
   (spotted by Rik van Riel)
 - make page_wrprotect() return the number of reprotected ptes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 include/linux/mm.h   |    2 +
 include/linux/rmap.h |    6 ++++
 mm/fremap.c          |   10 ++++++-
 mm/memory.c          |   34 +++++++++++++++++++++++---
 mm/page-writeback.c  |    9 +++++-
 mm/rmap.c            |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 120 insertions(+), 7 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-06 16:51:01.000000000 +0200
@@ -183,6 +183,8 @@ extern unsigned int kobjsize(const void 
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+#define VM_SharedWritable(v) (((v)->vm_flags & (VM_SHARED|VM_WRITE)) == (VM_SHARED|VM_WRITE))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/fremap.c	2006-05-06 16:51:01.000000000 +0200
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -79,9 +80,14 @@ int install_page(struct mm_struct *mm, s
 		inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			pte_val = pte_wrprotect(pte_val);
+	}
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	err = 0;
 unlock:
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-08 18:20:49.000000000 +0200
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2077,6 +2078,7 @@ static int do_no_page(struct mm_struct *
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	int dirty = 0;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2150,6 +2152,11 @@ retry:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		else if (VM_SharedWritable(vma)) {
+			struct address_space *mapping = page_mapping(new_page);
+			if (mapping && mapping_cap_account_dirty(mapping))
+				entry = pte_wrprotect(entry);
+		}
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
@@ -2159,6 +2166,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				get_page(new_page);
+				dirty++;
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2171,6 +2182,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty) {
+		set_page_dirty(new_page);
+		put_page(new_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
@@ -2235,6 +2250,7 @@ static inline int handle_pte_fault(struc
 	pte_t entry;
 	pte_t old_entry;
 	spinlock_t *ptl;
+	struct page *page = NULL;
 
 	old_entry = entry = *pte;
 	if (!pte_present(entry)) {
@@ -2257,12 +2273,20 @@ static inline int handle_pte_fault(struc
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+		if (!pte_write(entry)) {
+			if (!VM_SharedWritable(vma)) {
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			} else {
+				entry = pte_mkwrite(entry);
+				page = vm_normal_page(vma, address, entry);
+				get_page(page);
+			}
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
+
 	if (!pte_same(old_entry, entry)) {
 		ptep_set_access_flags(vma, address, pte, entry, write_access);
 		update_mmu_cache(vma, address, entry);
@@ -2279,6 +2303,10 @@ static inline int handle_pte_fault(struc
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
+	if (page) {
+		set_page_dirty(page);
+		put_page(page);
+	}
 	return VM_FAULT_MINOR;
 }
 
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2006-05-06 16:51:01.000000000 +0200
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -725,8 +726,10 @@ int test_clear_page_dirty(struct page *p
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -756,8 +759,10 @@ int clear_page_dirty_for_io(struct page 
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		return 0;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/mm/rmap.c	2006-05-06 16:51:01.000000000 +0200
@@ -478,6 +478,72 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_write(*pte))
+		goto unlock;
+
+	entry = pte_mkclean(pte_wrprotect(*pte));
+	ptep_establish(vma, address, pte, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_wrprotect_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (VM_SharedWritable(vma))
+			ret += page_wrprotect_one(page, vma);
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_wrprotect(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page) && page->mapping) {
+		if (!PageAnon(page))
+			ret = page_wrprotect_file(page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h	2006-05-06 16:45:24.000000000 +0200
+++ linux-2.6/include/linux/rmap.h	2006-05-06 16:51:01.000000000 +0200
@@ -105,6 +105,12 @@ pte_t *page_check_address(struct page *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Used to writeprotect clean pages, in order to count nr_dirty for shared
+ * mappings
+ */
+int page_wrprotect(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)



* [RFC][PATCH 2/2] throttle writers of shared mappings
  2006-05-08  6:43         ` Christoph Lameter
  2006-05-08  7:23           ` Peter Zijlstra
  2006-05-08 19:20           ` [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3 Peter Zijlstra
@ 2006-05-08 19:24           ` Peter Zijlstra
  2 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-08 19:24 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Now that we can detect writers of shared mappings, throttle them.
Avoids OOM by surprise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 mm/memory.c |    7 +++++++
 1 file changed, 7 insertions(+)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-08 18:20:49.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-08 18:35:26.000000000 +0200
@@ -50,6 +50,7 @@
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
 #include <linux/backing-dev.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2183,8 +2184,11 @@ retry:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty) {
+		struct address_space *mapping = page_mapping(new_page);
 		set_page_dirty(new_page);
 		put_page(new_page);
+		if (mapping)
+			balance_dirty_pages_ratelimited_nr(mapping, 1);
 	}
 	return ret;
 oom:
@@ -2304,8 +2308,11 @@ static inline int handle_pte_fault(struc
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	if (page) {
+		struct address_space *mapping = page_mapping(page);
 		set_page_dirty(page);
 		put_page(page);
+		if (mapping)
+			balance_dirty_pages_ratelimited_nr(mapping, 1);
 	}
 	return VM_FAULT_MINOR;
 }



* Re: [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3
  2006-05-08 19:20           ` [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3 Peter Zijlstra
@ 2006-05-09  5:41             ` Christoph Lameter
  2006-05-09  6:06               ` Peter Zijlstra
                                 ` (3 more replies)
  0 siblings, 4 replies; 43+ messages in thread
From: Christoph Lameter @ 2006-05-09  5:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

On Mon, 8 May 2006, Peter Zijlstra wrote:

> @@ -2077,6 +2078,7 @@ static int do_no_page(struct mm_struct *
>  	unsigned int sequence = 0;
>  	int ret = VM_FAULT_MINOR;
>  	int anon = 0;
> +	int dirty = 0;
	dirtied_page = NULL ?

> @@ -2150,6 +2152,11 @@ retry:
>  		entry = mk_pte(new_page, vma->vm_page_prot);
>  		if (write_access)
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);

A write fault to a shared mapping does not make the page dirty, just the 
pte?

>  			inc_mm_counter(mm, file_rss);
>  			page_add_file_rmap(new_page);
> +			if (write_access) {
> +				get_page(new_page);
> +				dirty++;
				dirtied_page = new_page; ?
				get_page(dirtied_page); ?

> +	if (dirty) {
> +		set_page_dirty(new_page);
> +		put_page(new_page);
> +	}

	if (dirtied_page) {
		set_page_dirty(dirtied_page);
		put_page(dirtied_page);
	}
?

> @@ -2235,6 +2250,7 @@ static inline int handle_pte_fault(struc
>  	pte_t entry;
>  	pte_t old_entry;
>  	spinlock_t *ptl;
> +	struct page *page = NULL;
use dirtied_page instead to make it the same as the other function?

> +int page_wrprotect(struct page *page)

The above and related functions look similar to code in 
rmap.c and migrate.c. Could those be consolidated?


* Re: [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3
  2006-05-09  5:41             ` Christoph Lameter
@ 2006-05-09  6:06               ` Peter Zijlstra
  2006-05-09 20:44               ` [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4 Peter Zijlstra
                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-09  6:06 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

On Mon, 2006-05-08 at 22:41 -0700, Christoph Lameter wrote:
> On Mon, 8 May 2006, Peter Zijlstra wrote:
> 
> > @@ -2077,6 +2078,7 @@ static int do_no_page(struct mm_struct *
> >  	unsigned int sequence = 0;
> >  	int ret = VM_FAULT_MINOR;
> >  	int anon = 0;
> > +	int dirty = 0;
> 	dirtied_page = NULL ?

Much nicer indeed!

> > @@ -2150,6 +2152,11 @@ retry:
> >  		entry = mk_pte(new_page, vma->vm_page_prot);
> >  		if (write_access)
> >  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> 
> A write fault to a shared mapping does not make the page dirty, just the 
> pte?

We do that here:

> >  			inc_mm_counter(mm, file_rss);
> >  			page_add_file_rmap(new_page);
> > +			if (write_access) {
> > +				get_page(new_page);
> > +				dirty++;



> > +int page_wrprotect(struct page *page)
> 
> The above and related functions look similar to code in 
> rmap.c and migrate.c. Could those be consolidated?

I'll have a look.

Peter


* [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-09  5:41             ` Christoph Lameter
  2006-05-09  6:06               ` Peter Zijlstra
@ 2006-05-09 20:44               ` Peter Zijlstra
  2006-05-09 20:52                 ` Peter Chubb
  2006-05-11 15:02                 ` Andrew Morton
  2006-05-09 20:44               ` [RFC][PATCH 2/3] throttle writers of shared mappings Peter Zijlstra
  2006-05-09 20:44               ` [RFC][PATCH 3/3] optimize follow_pages() Peter Zijlstra
  3 siblings, 2 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-09 20:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

People expressed the need to track dirty pages in shared mappings.

Linus outlined the general idea of doing that through making clean
writable pages write-protected and taking the write fault.

This patch does exactly that, it makes pages in a shared writable
mapping write-protected. On write-fault the pages are marked dirty and
made writable. When the pages get synced with their backing store, the
write-protection is re-instated.

It survives a simple test and shows the dirty pages in /proc/vmstat.

Changes in -v4:

 - small cleanup as suggested by Christoph Lameter.

Changes in -v3:

 - move set_page_dirty() outside pte lock (suggested by Christoph Lameter)

Changes in -v2:

 - only wrprotect pages from dirty capable mappings. (Nick Piggin)
 - move the writefault handling from do_wp_page() into handle_pte_fault(). 
   (Nick Piggin)
 - revert to the old install_page interface. (Nick Piggin)
 - also clear the pte dirty bit when we make pages read-only again.
   (spotted by Rik van Riel)
 - make page_wrprotect() return the number of reprotected ptes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 include/linux/mm.h   |    2 +
 include/linux/rmap.h |    6 ++++
 mm/fremap.c          |   10 ++++++-
 mm/memory.c          |   34 +++++++++++++++++++++++---
 mm/page-writeback.c  |    9 +++++-
 mm/rmap.c            |   66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 120 insertions(+), 7 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-08 18:49:39.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-08 18:53:34.000000000 +0200
@@ -183,6 +183,8 @@ extern unsigned int kobjsize(const void 
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+#define VM_SharedWritable(v) (((v)->vm_flags & (VM_SHARED|VM_WRITE)) == (VM_SHARED|VM_WRITE))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2006-05-08 18:49:39.000000000 +0200
+++ linux-2.6/mm/fremap.c	2006-05-08 18:53:34.000000000 +0200
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -79,9 +80,14 @@ int install_page(struct mm_struct *mm, s
 		inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			pte_val = pte_wrprotect(pte_val);
+	}
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	err = 0;
 unlock:
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-08 18:49:39.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-09 09:15:11.000000000 +0200
@@ -49,6 +49,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2077,6 +2078,7 @@ static int do_no_page(struct mm_struct *
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2150,6 +2152,11 @@ retry:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		else if (VM_SharedWritable(vma)) {
+			struct address_space *mapping = page_mapping(new_page);
+			if (mapping && mapping_cap_account_dirty(mapping))
+				entry = pte_wrprotect(entry);
+		}
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
@@ -2159,6 +2166,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2171,6 +2182,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
@@ -2235,6 +2250,7 @@ static inline int handle_pte_fault(struc
 	pte_t entry;
 	pte_t old_entry;
 	spinlock_t *ptl;
+	struct page *dirty_page = NULL;
 
 	old_entry = entry = *pte;
 	if (!pte_present(entry)) {
@@ -2257,12 +2273,20 @@ static inline int handle_pte_fault(struc
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+		if (!pte_write(entry)) {
+			if (!VM_SharedWritable(vma)) {
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			} else {
+				entry = pte_mkwrite(entry);
+				dirty_page = vm_normal_page(vma, address, entry);
+				get_page(dirty_page);
+			}
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
+
 	if (!pte_same(old_entry, entry)) {
 		ptep_set_access_flags(vma, address, pte, entry, write_access);
 		update_mmu_cache(vma, address, entry);
@@ -2279,6 +2303,10 @@ static inline int handle_pte_fault(struc
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return VM_FAULT_MINOR;
 }
 
Index: linux-2.6/mm/page-writeback.c
===================================================================
--- linux-2.6.orig/mm/page-writeback.c	2006-05-08 18:49:39.000000000 +0200
+++ linux-2.6/mm/page-writeback.c	2006-05-08 18:53:34.000000000 +0200
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -725,8 +726,10 @@ int test_clear_page_dirty(struct page *p
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -756,8 +759,10 @@ int clear_page_dirty_for_io(struct page 
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		return 0;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2006-05-08 18:49:39.000000000 +0200
+++ linux-2.6/mm/rmap.c	2006-05-08 18:53:34.000000000 +0200
@@ -478,6 +478,72 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_write(*pte))
+		goto unlock;
+
+	entry = pte_mkclean(pte_wrprotect(*pte));
+	ptep_establish(vma, address, pte, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_wrprotect_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (VM_SharedWritable(vma))
+			ret += page_wrprotect_one(page, vma);
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_wrprotect(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page) && page->mapping) {
+		if (!PageAnon(page))
+			ret = page_wrprotect_file(page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
Index: linux-2.6/include/linux/rmap.h
===================================================================
--- linux-2.6.orig/include/linux/rmap.h	2006-05-08 18:49:39.000000000 +0200
+++ linux-2.6/include/linux/rmap.h	2006-05-08 18:53:34.000000000 +0200
@@ -105,6 +105,12 @@ pte_t *page_check_address(struct page *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Used to writeprotect clean pages, in order to count nr_dirty for shared
+ * mappings
+ */
+int page_wrprotect(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)



* [RFC][PATCH 2/3] throttle writers of shared mappings
  2006-05-09  5:41             ` Christoph Lameter
  2006-05-09  6:06               ` Peter Zijlstra
  2006-05-09 20:44               ` [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4 Peter Zijlstra
@ 2006-05-09 20:44               ` Peter Zijlstra
  2006-05-09 22:54                 ` Nick Piggin
  2006-05-09 20:44               ` [RFC][PATCH 3/3] optimize follow_pages() Peter Zijlstra
  3 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-09 20:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Now that we can detect writers of shared mappings, throttle them.
Avoids OOM by surprise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 mm/memory.c |    7 +++++++
 1 file changed, 7 insertions(+)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-09 09:15:11.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-09 09:17:12.000000000 +0200
@@ -50,6 +50,7 @@
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
 #include <linux/backing-dev.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2183,8 +2184,11 @@ retry:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
+		struct address_space *mapping = page_mapping(dirty_page);
 		set_page_dirty(dirty_page);
 		put_page(dirty_page);
+		if (mapping)
+			balance_dirty_pages_ratelimited_nr(mapping, 1);
 	}
 	return ret;
 oom:
@@ -2304,8 +2308,11 @@ static inline int handle_pte_fault(struc
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	if (dirty_page) {
+		struct address_space *mapping = page_mapping(dirty_page);
 		set_page_dirty(dirty_page);
 		put_page(dirty_page);
+		if (mapping)
+			balance_dirty_pages_ratelimited_nr(mapping, 1);
 	}
 	return VM_FAULT_MINOR;
 }



* [RFC][PATCH 3/3] optimize follow_pages()
  2006-05-09  5:41             ` Christoph Lameter
                                 ` (2 preceding siblings ...)
  2006-05-09 20:44               ` [RFC][PATCH 2/3] throttle writers of shared mappings Peter Zijlstra
@ 2006-05-09 20:44               ` Peter Zijlstra
  2006-05-10  6:30                 ` Peter Zijlstra
  3 siblings, 1 reply; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-09 20:44 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Christoph Lameter suggested I pull set_page_dirty() out from under the 
pte lock.

I reviewed the current calls and found the one in follow_page() a candidate
for the same treatment.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 include/linux/mm.h |    1 +
 mm/memory.c        |   12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-08 18:53:34.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-09 17:48:36.000000000 +0200
@@ -1015,6 +1015,7 @@ struct page *follow_page(struct vm_area_
 #define FOLL_TOUCH	0x02	/* mark page accessed */
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
+#define FOLL_DIRTY	0x10	/* the page was dirtied */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-09 09:17:12.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-09 17:52:02.000000000 +0200
@@ -962,16 +962,22 @@ struct page *follow_page(struct vm_area_
 	if (unlikely(!page))
 		goto unlock;
 
-	if (flags & FOLL_GET)
+	if (flags & (FOLL_GET | FOLL_TOUCH))
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
-		mark_page_accessed(page);
+			flags |= FOLL_DIRTY;
 	}
 unlock:
 	pte_unmap_unlock(ptep, ptl);
+	if (flags & FOLL_TOUCH) {
+		if (flags & FOLL_DIRTY)
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+	if (!(flags & FOLL_GET))
+		put_page(page);
 out:
 	return page;
 



* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-09 20:44               ` [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4 Peter Zijlstra
@ 2006-05-09 20:52                 ` Peter Chubb
  2006-05-09 20:55                   ` Martin Bligh
  2006-05-11 15:02                 ` Andrew Morton
  1 sibling, 1 reply; 43+ messages in thread
From: Peter Chubb @ 2006-05-09 20:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Nick Piggin, Linus Torvalds, Andi Kleen,
	Rohit Seth, Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw,
	mel, marcelo, anton, paulmck, linux-mm

>>>>> "Peter" == Peter Zijlstra <a.p.zijlstra@chello.nl> writes:

Peter> From: Peter Zijlstra <a.p.zijlstra@chello.nl> People expressed
Peter> the need to track dirty pages in shared mappings.

Peter> Linus outlined the general idea of doing that through making
Peter> clean writable pages write-protected and taking the write
Peter> fault.

What does this do to performance on TPC workloads?  How many extra
faults are there likely to be?

--
Dr Peter Chubb  http://www.gelato.unsw.edu.au  peterc AT gelato.unsw.edu.au
http://www.ertos.nicta.com.au           ERTOS within National ICT Australia


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-09 20:52                 ` Peter Chubb
@ 2006-05-09 20:55                   ` Martin Bligh
  2006-05-09 22:56                     ` Brian Twichell
  2006-05-10  0:24                     ` Linus Torvalds
  0 siblings, 2 replies; 43+ messages in thread
From: Martin Bligh @ 2006-05-09 20:55 UTC (permalink / raw)
  To: Peter Chubb
  Cc: Peter Zijlstra, Christoph Lameter, Nick Piggin, Linus Torvalds,
	Andi Kleen, Rohit Seth, Andrew Morton, hugh, riel, andrea, arjan,
	apw, mel, marcelo, anton, paulmck, linux-mm

Peter Chubb wrote:
>>>>>>"Peter" == Peter Zijlstra <a.p.zijlstra@chello.nl> writes:
> 
> 
> Peter> From: Peter Zijlstra <a.p.zijlstra@chello.nl> People expressed
> Peter> the need to track dirty pages in shared mappings.
> 
> Peter> Linus outlined the general idea of doing that through making
> Peter> clean writable pages write-protected and taking the write
> Peter> fault.
> 
> What does this do to performance on TPC workloads?  How many extra
> faults are there likely to be?

They all use large pages anyway ...

M.


* Re: [RFC][PATCH 2/3] throttle writers of shared mappings
  2006-05-09 20:44               ` [RFC][PATCH 2/3] throttle writers of shared mappings Peter Zijlstra
@ 2006-05-09 22:54                 ` Nick Piggin
  2006-05-09 22:55                   ` Nick Piggin
  0 siblings, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-09 22:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Lameter, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

Peter Zijlstra wrote:

>@@ -2304,8 +2308,11 @@ static inline int handle_pte_fault(struc
> unlock:
> 	pte_unmap_unlock(pte, ptl);
> 	if (dirty_page) {
>+		struct address_space *mapping = page_mapping(dirty_page);
> 		set_page_dirty(dirty_page);
> 		put_page(dirty_page);
>+		if (mapping)
>+			balance_dirty_pages_ratelimited_nr(mapping, 1);
>  
>

Just use balance_dirty_pages()


* Re: [RFC][PATCH 2/3] throttle writers of shared mappings
  2006-05-09 22:54                 ` Nick Piggin
@ 2006-05-09 22:55                   ` Nick Piggin
  2006-05-10  6:25                     ` Peter Zijlstra
  0 siblings, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-09 22:55 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Peter Zijlstra, Christoph Lameter, Linus Torvalds, Andi Kleen,
	Rohit Seth, Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw,
	mel, marcelo, anton, paulmck, linux-mm


Nick Piggin wrote:

> Peter Zijlstra wrote:
>
>> @@ -2304,8 +2308,11 @@ static inline int handle_pte_fault(struc
>> unlock:
>>     pte_unmap_unlock(pte, ptl);
>>     if (dirty_page) {
>> +        struct address_space *mapping = page_mapping(dirty_page);
>>         set_page_dirty(dirty_page);
>>         put_page(dirty_page);
>> +        if (mapping)
>> +            balance_dirty_pages_ratelimited_nr(mapping, 1);
>>  
>>
>
> Just use balance_dirty_pages()


Err.. balance_dirty_pages_ratelimited();


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-09 20:55                   ` Martin Bligh
@ 2006-05-09 22:56                     ` Brian Twichell
  2006-05-10  0:24                     ` Linus Torvalds
  1 sibling, 0 replies; 43+ messages in thread
From: Brian Twichell @ 2006-05-09 22:56 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Peter Chubb, Peter Zijlstra, Christoph Lameter, Nick Piggin,
	Linus Torvalds, Andi Kleen, Rohit Seth, Andrew Morton, hugh, riel,
	andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Martin Bligh wrote:

> Peter Chubb wrote:
>
>>
>> What does this do to performance on TPC workloads?  How many extra
>> faults are there likely to be?
>
>
> They all use large pages anyway ...
>
> M.
>
That much is true, but let me make sure I understand how this will 
impact customer transaction-processing workloads, where hugepages are 
infrequently used.

Under the proposed scheme, when the database starts up, it will take a 
write fault when it initializes a shared memory page, and the page will 
be marked as dirty.  Then, as long as the page is never swapped out, the 
write protection will never be "re-armed".

Since customers usually tune their databases to never swap, the only
potential additional cost I would envision would be the small cost of
initially marking the page as write-protected, and of marking the page
as dirty during the first fault.

Does this sound reasonable?

Cheers,
Brian



* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-09 20:55                   ` Martin Bligh
  2006-05-09 22:56                     ` Brian Twichell
@ 2006-05-10  0:24                     ` Linus Torvalds
  2006-05-10  0:29                       ` Nick Piggin
  1 sibling, 1 reply; 43+ messages in thread
From: Linus Torvalds @ 2006-05-10  0:24 UTC (permalink / raw)
  To: Martin Bligh
  Cc: Peter Chubb, Peter Zijlstra, Christoph Lameter, Nick Piggin,
	Andi Kleen, Rohit Seth, Andrew Morton, hugh, riel, andrea, arjan,
	apw, mel, marcelo, anton, paulmck, linux-mm


On Tue, 9 May 2006, Martin Bligh wrote:
> Peter Chubb wrote:
> > 
> > What does this do to performance on TPC workloads?  How many extra
> > faults are there likely to be?
> 
> They all use large pages anyway ...

Yes, but we should really answer that question anyway.

I don't think anybody has really objected to this patch series, and it 
clearly seems to fix something (if nothing else, then Peter's test-program 
;), and for fairly obvious reasons _I_ approve of it.

But I've been asking people to benchmark it anyway, because it _will_ hurt 
users of shared writable mappings, even if it's hopefully less of an issue 
today thanks to large pages. 

Now, the fact that it will hurt them is not a total disaster, since we 
already know that there are ways to mitigate it. However, it's still true 
that we should have the numbers, if only to perhaps be able to say that we 
don't care.

		Linus


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-10  0:24                     ` Linus Torvalds
@ 2006-05-10  0:29                       ` Nick Piggin
  2006-05-10  1:24                         ` Linus Torvalds
  0 siblings, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-10  0:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Martin Bligh, Peter Chubb, Peter Zijlstra, Christoph Lameter,
	Nick Piggin, Andi Kleen, Rohit Seth, Andrew Morton, hugh, riel,
	andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Linus Torvalds wrote:
> On Tue, 9 May 2006, Martin Bligh wrote:
> 
>>Peter Chubb wrote:
>>
>>>What does this do to performance on TPC workloads?  How many extra
>>>faults are there likely to be?
>>
>>They all use large pages anyway ...
> 
> 
> Yes, but we should really answer that question anyway.
> 
> I don't think anybody has really objected to this patch series, and it 
> clearly seems to fix something (if nothing else, then Peter's test-program 
> ;), and for fairly obvious reasons _I_ approve of it.
> 
> But I've been asking people to benchmark it anyway, because it _will_ hurt 
> users of shared writable mappings, even if it's hopefully less of an issue 
> today thanks to large pages. 
> 
> Now, the fact that it will hurt them is not a total disaster, since we 
> already know that there are ways to mitigate it. However, it's still true 
> that we should have the numbers, if only to perhaps be able to say that we 
> don't care.

Those filesystems (shmem, presumably hugetlbfs) that are
!mapping_cap_account_dirty should not notice any difference, except
perhaps for the several extra branches and bytes of icache.

Or do these databases use regular filesystem backed shared files?

-- 
SUSE Labs, Novell Inc.


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-10  0:29                       ` Nick Piggin
@ 2006-05-10  1:24                         ` Linus Torvalds
  0 siblings, 0 replies; 43+ messages in thread
From: Linus Torvalds @ 2006-05-10  1:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Martin Bligh, Peter Chubb, Peter Zijlstra, Christoph Lameter,
	Nick Piggin, Andi Kleen, Rohit Seth, Andrew Morton, hugh, riel,
	andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm


On Wed, 10 May 2006, Nick Piggin wrote:
> 
> Or do these databases use regular filesystem backed shared files?

Some definitely do.

I don't work much with them, but I'm pretty sure, that's exactly what 
Berkeley DB does. Probably SQLite too (and maybe MySQL?).

And I know that nntpd used to map its "newsgroup list" (which is a 
database, although not a relational one, but a really stupid line-based 
plain-text setup).

So yes, we'll impact performance. Whether it will be really noticeable or 
not, who knows.

The only way to find out may be to just apply the patches (which I'm 
currently planning on doing early after 2.6.17 unless we can get data 
otherwise) and see who screams, if anybody.

		Linus


* Re: [RFC][PATCH 2/3] throttle writers of shared mappings
  2006-05-09 22:55                   ` Nick Piggin
@ 2006-05-10  6:25                     ` Peter Zijlstra
  0 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-10  6:25 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

On Wed, 2006-05-10 at 08:55 +1000, Nick Piggin wrote:
> Nick Piggin wrote:

> > Just use balance_dirty_pages()
> Err.. balance_dirty_pages_ratelimited();

Ah, tucked away in the header :-)
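
For reference, the wrapper tucked away in linux/writeback.h is roughly:

	static inline void
	balance_dirty_pages_ratelimited(struct address_space *mapping)
	{
		balance_dirty_pages_ratelimited_nr(mapping, 1);
	}

so for a single page the two spellings are equivalent; the plain
variant just reads better.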


From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Now that we can detect writers of shared mappings, throttle them.
Avoids OOM by surprise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

 mm/memory.c |    7 +++++++
 1 file changed, 7 insertions(+)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-09 22:27:24.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-10 08:12:40.000000000 +0200
@@ -50,6 +50,7 @@
 #include <linux/init.h>
 #include <linux/mm_page_replace.h>
 #include <linux/backing-dev.h>
+#include <linux/writeback.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2183,8 +2184,11 @@ retry:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
+		struct address_space *mapping = page_mapping(dirty_page);
 		set_page_dirty(dirty_page);
 		put_page(dirty_page);
+		if (mapping)
+			balance_dirty_pages_ratelimited(mapping);
 	}
 	return ret;
 oom:
@@ -2304,8 +2308,11 @@ static inline int handle_pte_fault(struc
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	if (dirty_page) {
+		struct address_space *mapping = page_mapping(dirty_page);
 		set_page_dirty(dirty_page);
 		put_page(dirty_page);
+		if (mapping)
+			balance_dirty_pages_ratelimited(mapping);
 	}
 	return VM_FAULT_MINOR;
 }



* [RFC][PATCH 3/3] optimize follow_pages()
  2006-05-09 20:44               ` [RFC][PATCH 3/3] optimize follow_pages() Peter Zijlstra
@ 2006-05-10  6:30                 ` Peter Zijlstra
  0 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-10  6:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Linus Torvalds, Andi Kleen, Rohit Seth,
	Andrew Morton, mbligh, hugh, riel, andrea, arjan, apw, mel,
	marcelo, anton, paulmck, linux-mm

Of course I got the return paths tangled :-(

---

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

Christoph Lameter suggested I pull set_page_dirty() out from under the 
pte lock.

I reviewed the current calls and found the one in follow_page() a candidate
for the same treatment.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

---

 include/linux/mm.h |    1 +
 mm/memory.c        |   17 ++++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2006-05-10 08:11:18.000000000 +0200
+++ linux-2.6/include/linux/mm.h	2006-05-10 08:12:47.000000000 +0200
@@ -1015,6 +1015,7 @@ struct page *follow_page(struct vm_area_
 #define FOLL_TOUCH	0x02	/* mark page accessed */
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
+#define FOLL_DIRTY	0x10	/* the page was dirtied */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2006-05-10 08:12:40.000000000 +0200
+++ linux-2.6/mm/memory.c	2006-05-10 08:16:33.000000000 +0200
@@ -962,18 +962,28 @@ struct page *follow_page(struct vm_area_
 	if (unlikely(!page))
 		goto unlock;
 
-	if (flags & FOLL_GET)
+	if (flags & (FOLL_GET | FOLL_TOUCH))
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
+			flags |= FOLL_DIRTY;
+	}
+
+	pte_unmap_unlock(ptep, ptl);
+
+	if (flags & FOLL_TOUCH) {
+		if (flags & FOLL_DIRTY)
 			set_page_dirty(page);
 		mark_page_accessed(page);
 	}
+	if (!(flags & FOLL_GET))
+		put_page(page);
+	goto out;
+
 unlock:
 	pte_unmap_unlock(ptep, ptl);
-out:
-	return page;
+	goto out;
 
 no_page_table:
 	/*
@@ -986,6 +996,7 @@ no_page_table:
 			get_page(page);
 		BUG_ON(flags & FOLL_WRITE);
 	}
+out:
 	return page;
 }
 



* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-09 20:44               ` [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4 Peter Zijlstra
  2006-05-09 20:52                 ` Peter Chubb
@ 2006-05-11 15:02                 ` Andrew Morton
  2006-05-11 16:39                   ` Andy Whitcroft
                                     ` (3 more replies)
  1 sibling, 4 replies; 43+ messages in thread
From: Andrew Morton @ 2006-05-11 15:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: clameter, piggin, torvalds, ak, rohitseth, mbligh, hugh, riel,
	andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> 
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> 
> People expressed the need to track dirty pages in shared mappings.
> 
> Linus outlined the general idea of doing that through making clean
> writable pages write-protected and taking the write fault.
> 
> This patch does exactly that, it makes pages in a shared writable
> mapping write-protected. On write-fault the pages are marked dirty and
> made writable. When the pages get synced with their backing store, the
> write-protection is re-instated.
> 
> It survives a simple test and shows the dirty pages in /proc/vmstat.

It'd be nice to have more than a "simple test" done.  Bugs in this area
will be subtle and will manifest in unpleasant ways.  That goes for both
correctness and performance bugs.

> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2006-05-08 18:49:39.000000000 +0200
> +++ linux-2.6/mm/memory.c	2006-05-09 09:15:11.000000000 +0200
> @@ -49,6 +49,7 @@
>  #include <linux/module.h>
>  #include <linux/init.h>
>  #include <linux/mm_page_replace.h>
> +#include <linux/backing-dev.h>
>  
>  #include <asm/pgalloc.h>
>  #include <asm/uaccess.h>
> @@ -2077,6 +2078,7 @@ static int do_no_page(struct mm_struct *
>  	unsigned int sequence = 0;
>  	int ret = VM_FAULT_MINOR;
>  	int anon = 0;
> +	struct page *dirty_page = NULL;
>  
>  	pte_unmap(page_table);
>  	BUG_ON(vma->vm_flags & VM_PFNMAP);
> @@ -2150,6 +2152,11 @@ retry:
>  		entry = mk_pte(new_page, vma->vm_page_prot);
>  		if (write_access)
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +		else if (VM_SharedWritable(vma)) {
> +			struct address_space *mapping = page_mapping(new_page);
> +			if (mapping && mapping_cap_account_dirty(mapping))
> +				entry = pte_wrprotect(entry);
> +		}
>  		set_pte_at(mm, address, page_table, entry);
>  		if (anon) {
>  			inc_mm_counter(mm, anon_rss);
> @@ -2159,6 +2166,10 @@ retry:
>  		} else {
>  			inc_mm_counter(mm, file_rss);
>  			page_add_file_rmap(new_page);
> +			if (write_access) {
> +				dirty_page = new_page;
> +				get_page(dirty_page);
> +			}

So let's see.  We take a write fault, we mark the page dirty then we return
to userspace which will proceed with the write and will mark the pte dirty.

Later, the VM will write the page out.

Later still, the pte will get cleaned by reclaim or by munmap or whatever
and the page will be marked dirty and the page will again be written out. 
Potentially needlessly.

How much extra IO will we be doing because of this change?

>  		return 0;
> Index: linux-2.6/mm/rmap.c
> ===================================================================
> --- linux-2.6.orig/mm/rmap.c	2006-05-08 18:49:39.000000000 +0200
> +++ linux-2.6/mm/rmap.c	2006-05-08 18:53:34.000000000 +0200
> @@ -478,6 +478,72 @@ int page_referenced(struct page *page, i
>  	return referenced;
>  }
>  
> +static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long address;
> +	pte_t *pte, entry;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +
> +	address = vma_address(page, vma);
> +	if (address == -EFAULT)
> +		goto out;
> +
> +	pte = page_check_address(page, mm, address, &ptl);
> +	if (!pte)
> +		goto out;
> +
> +	if (!pte_write(*pte))
> +		goto unlock;
> +
> +	entry = pte_mkclean(pte_wrprotect(*pte));
> +	ptep_establish(vma, address, pte, entry);
> +	update_mmu_cache(vma, address, entry);
> +	lazy_mmu_prot_update(entry);
> +	ret = 1;
> +
> +unlock:
> +	pte_unmap_unlock(pte, ptl);
> +out:
> +	return ret;
> +}
> +
> +static int page_wrprotect_file(struct page *page)
> +{
> +	struct address_space *mapping = page->mapping;
> +	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> +	struct vm_area_struct *vma;
> +	struct prio_tree_iter iter;
> +	int ret = 0;
> +
> +	BUG_ON(PageAnon(page));
> +
> +	spin_lock(&mapping->i_mmap_lock);
> +
> +	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
> +		if (VM_SharedWritable(vma))
> +			ret += page_wrprotect_one(page, vma);
> +	}
> +
> +	spin_unlock(&mapping->i_mmap_lock);
> +	return ret;
> +}
> +
> +int page_wrprotect(struct page *page)
> +{
> +	int ret = 0;
> +
> +	BUG_ON(!PageLocked(page));

hm.  So clear_page_dirty() and clear_page_dirty_for_io() are only ever
called against a locked page?  I guess that makes sense, but it's not a
guarantee which we had in the past.  It really _has_ to be true, because
lock_page() is the only thing which can protect the address_space from
memory reclaim in those two functions.

Oh well.  We'll find out if people's machines start to go BUG.

> +	if (page_mapped(page) && page->mapping) {

umm, afaict this function can be called for swapcache pages and Bad Things
will happen.  I think we need page_mapping(page) here?
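
That is, something like the following sketch, using the helper that
knows about anon and swap cache pages instead of the raw ->mapping
pointer:

	if (page_mapped(page) && page_mapping(page)) {
		if (!PageAnon(page))
			ret = page_wrprotect_file(page);
	}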



* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 15:02                 ` Andrew Morton
@ 2006-05-11 16:39                   ` Andy Whitcroft
  2006-05-11 22:52                   ` Christoph Lameter
                                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 43+ messages in thread
From: Andy Whitcroft @ 2006-05-11 16:39 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, clameter, piggin, torvalds, ak, rohitseth, mbligh,
	hugh, riel, andrea, arjan, mel, marcelo, anton, paulmck, linux-mm

Andrew Morton wrote:
> Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
>>
>>From: Peter Zijlstra <a.p.zijlstra@chello.nl>
>>
>>People expressed the need to track dirty pages in shared mappings.
>>
>>Linus outlined the general idea of doing that through making clean
>>writable pages write-protected and taking the write fault.
>>
>>This patch does exactly that, it makes pages in a shared writable
>>mapping write-protected. On write-fault the pages are marked dirty and
>>made writable. When the pages get synced with their backing store, the
>>write-protection is re-instated.
>>
>>It survives a simple test and shows the dirty pages in /proc/vmstat.
> 
> 
> It'd be nice to have more than a "simple test" done.  Bugs in this area
> will be subtle and will manifest in unpleasant ways.  That goes for both
> correctness and performance bugs.

I'll kick off some testing of this stack and see what occurs.  Should
appear on t.k.o in due time.

Cheers.

-apw


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 15:02                 ` Andrew Morton
  2006-05-11 16:39                   ` Andy Whitcroft
@ 2006-05-11 22:52                   ` Christoph Lameter
  2006-05-11 23:30                     ` Linus Torvalds
  2006-05-12  1:51                   ` Nick Piggin
  2006-05-12  4:51                   ` Christoph Lameter
  3 siblings, 1 reply; 43+ messages in thread
From: Christoph Lameter @ 2006-05-11 22:52 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, piggin, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

On Thu, 11 May 2006, Andrew Morton wrote:

> > It survives a simple test and shows the dirty pages in /proc/vmstat.
> 
> It'd be nice to have more than a "simple test" done.  Bugs in this area
> will be subtle and will manifest in unpleasant ways.  That goes for both
> correctness and performance bugs.

Standard tests such as AIM7 will not trigger these paths. It is rather
unusual for small unix processes to have a shared writable mapping, and
therefore I doubt that the typical benchmarks will show much of a
difference. These types of mappings are more typical for large or
specialized apps. Be sure that the tests actually do dirty
pages in shared writable mappings.

> > +int page_wrprotect(struct page *page)
> > +{
> > +	int ret = 0;
> > +
> > +	BUG_ON(!PageLocked(page));
> 
> hm.  So clear_page_dirty() and clear_page_dirty_for_io() are only ever
> called against a locked page?  I guess that makes sense, but it's not a
> guarantee which we had in the past.  It really _has_ to be true, because
> lock_page() is the only thing which can protect the address_space from
> memory reclaim in those two functions.

If that is true then we can get rid of atomic ops in both functions.
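
E.g. (an untested sketch, and only safe if _every_ path that touches
PG_dirty on such pages really does run under the page lock):

	/* instead of the atomic TestClearPageDirty(page): */
	if (__test_and_clear_bit(PG_dirty, &page->flags)) {
		...
	}

The set_page_dirty() variants that run without the page lock would
need auditing first, though.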


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 22:52                   ` Christoph Lameter
@ 2006-05-11 23:30                     ` Linus Torvalds
  2006-05-11 23:44                       ` Andrew Morton
  0 siblings, 1 reply; 43+ messages in thread
From: Linus Torvalds @ 2006-05-11 23:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Peter Zijlstra, piggin, ak, rohitseth, mbligh,
	hugh, riel, andrea, arjan, apw, mel, marcelo, anton, paulmck,
	linux-mm


On Thu, 11 May 2006, Christoph Lameter wrote:
> On Thu, 11 May 2006, Andrew Morton wrote:
> >
> > It'd be nice to have more that a "simple test" done.  Bugs in this area
> > will be subtle and will manifest in unpleasant ways.  That goes for both
> > correctness and performance bugs.
> 
> Standard tests such as AIM7 will not trigger these paths. It is rather
> unusual for small unix processes to have a shared writable mapping, and
> therefore I doubt that the typical benchmarks will show much of a
> difference. These types of mappings are more typical for large or
> specialized apps. Be sure that the tests actually do dirty
> pages in shared writable mappings.

What happened to the VM stress-test programs that we used to test the 
page-out with? I forget who kept a collection of them around, but they did 
things like trying to cause MM problems on purpose. And I'm pretty sure 
some of the nastiest ones used shared mappings, exactly because we've had 
problems with the virtual scanning.

I have a very distinct memory of somebody (I'd like to say Con, but that's 
probably bogus) collecting a few programs that were known to cause nasty 
problems (like the system just becoming totally unresponsive), for
checking that things degraded reasonably before getting killed by OOM.

I'm talking the 2.4.x timeframe, so it's a few years ago. It might not be 
a real _benchmark_ per se, but I think it would be an interesting 
data-point whether the system acts "better" with some of those tests..

		Linus


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 23:30                     ` Linus Torvalds
@ 2006-05-11 23:44                       ` Andrew Morton
  2006-05-12  0:10                         ` Linus Torvalds
                                           ` (2 more replies)
  0 siblings, 3 replies; 43+ messages in thread
From: Andrew Morton @ 2006-05-11 23:44 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: clameter, a.p.zijlstra, piggin, ak, rohitseth, mbligh, hugh, riel,
	andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Linus Torvalds <torvalds@osdl.org> wrote:
>
> What happened to the VM stress-test programs that we used to test the 
> page-out with? I forget who kept a collection of them around, but they did 
> things like trying to cause MM problems on purpose.

I think that was me, back in my programming days.

> And I'm pretty sure 
> some of the nastiest ones used shared mappings, exactly because we've had 
> problems with the virtual scanning.

http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz

run-bash-shared-mapping.sh is a good stress-tester and deadlock-finder.

Running fsx-linux (in mmap-read and mmap-write and read and write mode) in
combination with memory pressure is a good correctness-tester.  Needs to be
run on various filesystems too.


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 23:44                       ` Andrew Morton
@ 2006-05-12  0:10                         ` Linus Torvalds
  2006-05-12  8:07                         ` Andy Whitcroft
  2006-05-14 15:58                         ` Andy Whitcroft
  2 siblings, 0 replies; 43+ messages in thread
From: Linus Torvalds @ 2006-05-12  0:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: clameter, a.p.zijlstra, piggin, ak, rohitseth, mbligh, hugh, riel,
	andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm


On Thu, 11 May 2006, Andrew Morton wrote:
> 
> I think that was me, back in my programming days.

How time flies ;)

> http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz

usemem was one of the tools I was thinking of, so this may well be it. 

		Linus


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 15:02                 ` Andrew Morton
  2006-05-11 16:39                   ` Andy Whitcroft
  2006-05-11 22:52                   ` Christoph Lameter
@ 2006-05-12  1:51                   ` Nick Piggin
  2006-05-12  4:30                     ` Andrew Morton
  2006-05-12  4:51                   ` Christoph Lameter
  3 siblings, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-12  1:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm


Andrew Morton wrote:

>Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
>>
>>From: Peter Zijlstra <a.p.zijlstra@chello.nl>
>>
>>People expressed the need to track dirty pages in shared mappings.
>>
>>Linus outlined the general idea of doing that through making clean
>>writable pages write-protected and taking the write fault.
>>
>>This patch does exactly that, it makes pages in a shared writable
>>mapping write-protected. On write-fault the pages are marked dirty and
>>made writable. When the pages get synced with their backing store, the
>>write-protection is re-instated.
>>
>>It survives a simple test and shows the dirty pages in /proc/vmstat.
>>
>
>It'd be nice to have more than a "simple test" done.  Bugs in this area
>will be subtle and will manifest in unpleasant ways.  That goes for both
>correctness and performance bugs.
>
>
>>Index: linux-2.6/mm/memory.c
>>===================================================================
>>--- linux-2.6.orig/mm/memory.c	2006-05-08 18:49:39.000000000 +0200
>>+++ linux-2.6/mm/memory.c	2006-05-09 09:15:11.000000000 +0200
>>@@ -49,6 +49,7 @@
>> #include <linux/module.h>
>> #include <linux/init.h>
>> #include <linux/mm_page_replace.h>
>>+#include <linux/backing-dev.h>
>> 
>> #include <asm/pgalloc.h>
>> #include <asm/uaccess.h>
>>@@ -2077,6 +2078,7 @@ static int do_no_page(struct mm_struct *
>> 	unsigned int sequence = 0;
>> 	int ret = VM_FAULT_MINOR;
>> 	int anon = 0;
>>+	struct page *dirty_page = NULL;
>> 
>> 	pte_unmap(page_table);
>> 	BUG_ON(vma->vm_flags & VM_PFNMAP);
>>@@ -2150,6 +2152,11 @@ retry:
>> 		entry = mk_pte(new_page, vma->vm_page_prot);
>> 		if (write_access)
>> 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>+		else if (VM_SharedWritable(vma)) {
>>+			struct address_space *mapping = page_mapping(new_page);
>>+			if (mapping && mapping_cap_account_dirty(mapping))
>>+				entry = pte_wrprotect(entry);
>>+		}
>> 		set_pte_at(mm, address, page_table, entry);
>> 		if (anon) {
>> 			inc_mm_counter(mm, anon_rss);
>>@@ -2159,6 +2166,10 @@ retry:
>> 		} else {
>> 			inc_mm_counter(mm, file_rss);
>> 			page_add_file_rmap(new_page);
>>+			if (write_access) {
>>+				dirty_page = new_page;
>>+				get_page(dirty_page);
>>+			}
>>
>
>So let's see.  We take a write fault, we mark the page dirty then we return
>to userspace which will proceed with the write and will mark the pte dirty.
>
>Later, the VM will write the page out.
>
>Later still, the pte will get cleaned by reclaim or by munmap or whatever
>and the page will be marked dirty and the page will again be written out. 
>Potentially needlessly.
>

page_wrprotect also marks the page clean, so this window is very small.
The window is that the fault path might set_page_dirty, then throttle
on writeout, and the page gets written out before it really gets dirtied
by the application (which then has to fault again).

>
>How much extra IO will we be doing because of this change?
>

Of course it can do potentially quite a lot more IO in some cases, if
an application likes to dirty a working set larger than the writeout
thresholds... the same scenario as write(2) has now.


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  1:51                   ` Nick Piggin
@ 2006-05-12  4:30                     ` Andrew Morton
  2006-05-12  5:05                       ` Nick Piggin
  2006-05-12  7:06                       ` Peter Zijlstra
  0 siblings, 2 replies; 43+ messages in thread
From: Andrew Morton @ 2006-05-12  4:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: a.p.zijlstra, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Nick Piggin <piggin@cyberone.com.au> wrote:
>
>  >So let's see.  We take a write fault, we mark the page dirty then we return
>  >to userspace which will proceed with the write and will mark the pte dirty.
>  >
>  >Later, the VM will write the page out.
>  >
>  >Later still, the pte will get cleaned by reclaim or by munmap or whatever
>  >and the page will be marked dirty and the page will again be written out. 
>  >Potentially needlessly.
>  >
> 
>  page_wrprotect also marks the page clean,

Oh.  I missed that when reading the comment which describes
page_wrprotect() (I do go on).

> so this window is very small.
>  The window is that the fault path might set_page_dirty, then throttle
>  on writeout, and the page gets written out before it really gets dirtied
>  by the application (which then has to fault again).

: int test_clear_page_dirty(struct page *page)
: {
: 	struct address_space *mapping = page_mapping(page);
: 	unsigned long flags;
: 
: 	if (mapping) {
: 		write_lock_irqsave(&mapping->tree_lock, flags);
: 		if (TestClearPageDirty(page)) {
: 			radix_tree_tag_clear(&mapping->page_tree,
: 						page_index(page),
: 						PAGECACHE_TAG_DIRTY);
: 			write_unlock_irqrestore(&mapping->tree_lock, flags);
: 			/*
: 			 * We can continue to use `mapping' here because the
: 			 * page is locked, which pins the address_space
: 			 */

So if userspace modifies the page right here, and marks the pte dirty.

: 			if (mapping_cap_account_dirty(mapping)) {
: 				page_wrprotect(page);

We just lost that pte dirty bit, and hence the user's data.

: 				dec_page_state(nr_dirty);
: 			}
: 			return 1;
: 		}
: 		write_unlock_irqrestore(&mapping->tree_lock, flags);
: 		return 0;
: 	}
: 	return TestClearPageDirty(page);
: }
: 

Which is just the sort of subtle and nasty problem I was referring to...

If that's correct then I guess we need the

                if (ptep_clear_flush_dirty(vma, addr, pte) ||
                                page_test_and_clear_dirty(page))
                        ret += set_page_dirty(page);

treatment in page_wrprotect().
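
In page_wrprotect_one() that might look something like this untested
sketch, harvesting a racing dirty bit instead of discarding it:

	if (ptep_clear_flush_dirty(vma, address, pte) ||
	    page_test_and_clear_dirty(page))
		set_page_dirty(page);	/* don't lose the user's data */
	entry = pte_mkclean(pte_wrprotect(*pte));
	ptep_establish(vma, address, pte, entry);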

Now I suppose it's not really a data-loss race, because in practice the
kernel is about to write this page to backing store anyway.  I guess.  I
won't be true.

Someone please convince me that this has all been thought about and is solid
as a rock.


* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 15:02                 ` Andrew Morton
                                     ` (2 preceding siblings ...)
  2006-05-12  1:51                   ` Nick Piggin
@ 2006-05-12  4:51                   ` Christoph Lameter
  3 siblings, 0 replies; 43+ messages in thread
From: Christoph Lameter @ 2006-05-12  4:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Peter Zijlstra, piggin, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

On Thu, 11 May 2006, Andrew Morton wrote:

> So let's see.  We take a write fault, we mark the page dirty then we return
> to userspace which will proceed with the write and will mark the pte dirty.

The pte is marked dirty when the page is marked dirty.

> Later still, the pte will get cleaned by reclaim or by munmap or whatever
> and the page will be marked dirty and the page will again be written out. 
> Potentially needlessly.

But that is consistent with the way write() works on page buffers. It is
rather surprising that one can dirty lots of mmapped pages and that they
are only written out when the process terminates.
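
To make that concrete, a hypothetical userspace test (not a program from
this thread; the filename is made up) that dirties a large shared file
mapping and never calls msync(); without dirty-page tracking the kernel
only learns of the dirtying at munmap/exit:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB of file-backed pages */
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);
	char *p;

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, len);	/* dirties every page through the mapping */
	pause();		/* no msync(): the dirty pages just sit in memory */
	return 0;
}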


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  4:30                     ` Andrew Morton
@ 2006-05-12  5:05                       ` Nick Piggin
  2006-05-12  7:06                       ` Peter Zijlstra
  1 sibling, 0 replies; 43+ messages in thread
From: Nick Piggin @ 2006-05-12  5:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: a.p.zijlstra, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Andrew Morton wrote:

>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>> >So let's see.  We take a write fault, we mark the page dirty then we return
>> >to userspace which will proceed with the write and will mark the pte dirty.
>> >
>> >Later, the VM will write the page out.
>> >
>> >Later still, the pte will get cleaned by reclaim or by munmap or whatever
>> >and the page will be marked dirty and the page will again be written out. 
>> >Potentially needlessly.
>> >
>>
>> page_wrprotect also marks the page clean,
>>
>
>Oh.  I missed that when reading the comment which describes
>page_wrprotect() (I do go on).
>

I guess page_wrprotect isn't a good name, because it would suggest it
can be used in situations where it would cause data loss.

page_mappings_mkclean or page_mkclean might be better (the wrprotect
is just a side effect of the fact that clean, shared mappings are
readonly).

>
>>so this window is very small.
>> The window is that the fault path might set_page_dirty, then throttle
>> on writeout, and the page gets written out before it really gets dirtied
>> by the application (which then has to fault again).
>>
>
>: int test_clear_page_dirty(struct page *page)
>: {
>: 	struct address_space *mapping = page_mapping(page);
>: 	unsigned long flags;
>: 
>: 	if (mapping) {
>: 		write_lock_irqsave(&mapping->tree_lock, flags);
>: 		if (TestClearPageDirty(page)) {
>: 			radix_tree_tag_clear(&mapping->page_tree,
>: 						page_index(page),
>: 						PAGECACHE_TAG_DIRTY);
>: 			write_unlock_irqrestore(&mapping->tree_lock, flags);
>: 			/*
>: 			 * We can continue to use `mapping' here because the
>: 			 * page is locked, which pins the address_space
>: 			 */
>
>So if userspace modifies the page right here, and marks the pte dirty.
>
>: 			if (mapping_cap_account_dirty(mapping)) {
>: 				page_wrprotect(page);
>
>We just lost that pte dirty bit, and hence the user's data.
>
>: 				dec_page_state(nr_dirty);
>: 			}
>: 			return 1;
>: 		}
>: 		write_unlock_irqrestore(&mapping->tree_lock, flags);
>: 		return 0;
>: 	}
>: 	return TestClearPageDirty(page);
>: }
>: 
>
>Which is just the sort of subtle and nasty problem I was referring to...
>
>If that's correct then I guess we need the
>
>                if (ptep_clear_flush_dirty(vma, addr, pte) ||
>                                page_test_and_clear_dirty(page))
>                        ret += set_page_dirty(page);
>
>treatment in page_wrprotect().
>
>Now I suppose it's not really a data-loss race, because in practice the
>kernel is about to write this page to backing store anyway.  I guess.  I
>cannot immediately think of any clear_page_dirty() callers for whom that
>won't be true.
>

If they do a clear_page_dirty, then fail to clean the page, then fail to
subsequently set_page_dirty again, it is a data-loss situation anyway.

If they do set_page_dirty (which they have to, for correctness), then the
page has PG_dirty set again; true dirty bits are moved out of the ptes,
but that's no problem.
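
As a sketch of that rule, a hypothetical ->writepage failure path
(example_can_write_now() and example_do_io() are invented names, not code
from any tree): whoever fails to clean the page must put the dirty state
back before anyone forgets about it:

static int example_writepage(struct page *page, struct writeback_control *wbc)
{
	if (!example_can_write_now(page)) {
		/* failed to clean the page: restore PG_dirty and the
		 * radix-tree dirty tag, exactly as the argument requires */
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}
	return example_do_io(page, wbc);	/* starts writeout and unlocks */
}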


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  4:30                     ` Andrew Morton
  2006-05-12  5:05                       ` Nick Piggin
@ 2006-05-12  7:06                       ` Peter Zijlstra
  2006-05-12  8:04                         ` Nick Piggin
  2006-05-12  8:07                         ` Nick Piggin
  1 sibling, 2 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-12  7:06 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Nick Piggin, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

On Thu, 2006-05-11 at 21:30 -0700, Andrew Morton wrote:
> Nick Piggin <piggin@cyberone.com.au> wrote:
> >
> >  >So let's see.  We take a write fault, we mark the page dirty then we return
> >  >to userspace which will proceed with the write and will mark the pte dirty.
> >  >
> >  >Later, the VM will write the page out.
> >  >
> >  >Later still, the pte will get cleaned by reclaim or by munmap or whatever
> >  >and the page will be marked dirty and the page will again be written out. 
> >  >Potentially needlessly.
> >  >
> > 
> >  page_wrprotect also marks the page clean,
> 
> Oh.  I missed that when reading the comment which describes
> page_wrprotect() (I do go on).

Yes, this name is not the best of names :-(

I was aware of this, but since in my mind the counting through protection
faults was the prime idea, I stuck to page_wrprotect().

But I'm hard pressed to come up with a better one. Nick proposes:
 page_mkclean()
But that also doesn't cover the whole of it from my perspective.

> > so this window is very small.
> >  The window is that the fault path might set_page_dirty, then throttle
> >  on writeout, and the page gets written out before it really gets dirtied
> >  by the application (which then has to fault again).
> 
> : int test_clear_page_dirty(struct page *page)
> : {
> : 	struct address_space *mapping = page_mapping(page);
> : 	unsigned long flags;
> : 
> : 	if (mapping) {
> : 		write_lock_irqsave(&mapping->tree_lock, flags);
> : 		if (TestClearPageDirty(page)) {
> : 			radix_tree_tag_clear(&mapping->page_tree,
> : 						page_index(page),
> : 						PAGECACHE_TAG_DIRTY);
> : 			write_unlock_irqrestore(&mapping->tree_lock, flags);
> : 			/*
> : 			 * We can continue to use `mapping' here because the
> : 			 * page is locked, which pins the address_space
> : 			 */
> 
> So if userspace modifies the page right here, and marks the pte dirty.
> 
> : 			if (mapping_cap_account_dirty(mapping)) {
> : 				page_wrprotect(page);
> 
> We just lost that pte dirty bit, and hence the user's data.

I thought that at the time we clear PAGECACHE_TAG_DIRTY the page is in
flight to disk.  Now that I look at it again, perhaps the page_wrprotect()
call in clear_page_dirty_for_io() should be in test_set_page_writeback().

> : 				dec_page_state(nr_dirty);
> : 			}
> : 			return 1;
> : 		}
> : 		write_unlock_irqrestore(&mapping->tree_lock, flags);
> : 		return 0;
> : 	}
> : 	return TestClearPageDirty(page);
> : }
> : 
> 
> Which is just the sort of subtle and nasty problem I was referring to...
> 
> If that's correct then I guess we need the
> 
>                 if (ptep_clear_flush_dirty(vma, addr, pte) ||
>                                 page_test_and_clear_dirty(page))
>                         ret += set_page_dirty(page);
> 
> treatment in page_wrprotect().

I would make that a BUG_ON(); the only way for a pte of a shared mapping
to become dirty is through the fault handler, and that should already call
set_page_dirty() on it.
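
Sketched in place of Andrew's suggested treatment (hypothetical placement
inside the per-pte loop):

	/* if shared-mapping ptes can only become dirty via the
	 * write-fault path, which already called set_page_dirty(),
	 * a dirty pte here means the accounting was bypassed: */
	BUG_ON(pte_dirty(*pte));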

> Now I suppose it's not really a data-loss race, because in practice the
> kernel is about to write this page to backing store anyway.  I guess.  I
> cannot immediately think of any clear_page_dirty() callers for whom that
> won't be true.
> 
> Someone please convince me that this has all been thought about and is solid
> as a rock.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  7:06                       ` Peter Zijlstra
@ 2006-05-12  8:04                         ` Nick Piggin
  2006-05-12  8:52                           ` Peter Zijlstra
  2006-05-12  8:07                         ` Nick Piggin
  1 sibling, 1 reply; 43+ messages in thread
From: Nick Piggin @ 2006-05-12  8:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

Peter Zijlstra wrote:

>On Thu, 2006-05-11 at 21:30 -0700, Andrew Morton wrote:
>
>>Nick Piggin <piggin@cyberone.com.au> wrote:
>>
>>> >So let's see.  We take a write fault, we mark the page dirty then we return
>>> >to userspace which will proceed with the write and will mark the pte dirty.
>>> >
>>> >Later, the VM will write the page out.
>>> >
>>> >Later still, the pte will get cleaned by reclaim or by munmap or whatever
>>> >and the page will be marked dirty and the page will again be written out. 
>>> >Potentially needlessly.
>>> >
>>>
>>> page_wrprotect also marks the page clean,
>>>
>>Oh.  I missed that when reading the comment which describes
>>page_wrprotect() (I do go on).
>>
>
>Yes, this name is not the best of names :-(
>
>I was aware of this, but since in my mind the counting through protection
>faults was the prime idea, I stuck to page_wrprotect().
>
>But I'm hard pressed to come up with a better one. Nick proposes:
> page_mkclean()
>But that also doesn't cover the whole of it from my perspective.
>

What's your perspective?

With mmap shared accounting, the _whole VM's_ perspective is that clean
MAP_SHARED ptes are marked readonly.

The logical operation is marking the page's ptes clean. The VM mechanism
also marks the ptes readonly as a side effect of that. Think about it:
writeback does not want to make the page write-protected; it wants to make
it clean.
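
In code terms, a sketch of the test_clear_page_dirty() call site after
such a rename; page_mkclean() is only the proposed name here, not an
existing function:

	if (mapping_cap_account_dirty(mapping)) {
		/* writeback's intent: make the page's ptes clean;
		 * write-protecting them is the mechanism, not the goal */
		page_mkclean(page);	/* was: page_wrprotect(page) */
		dec_page_state(nr_dirty);
	}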


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 23:44                       ` Andrew Morton
  2006-05-12  0:10                         ` Linus Torvalds
@ 2006-05-12  8:07                         ` Andy Whitcroft
  2006-05-12 14:25                           ` Martin J. Bligh
  2006-05-14 15:58                         ` Andy Whitcroft
  2 siblings, 1 reply; 43+ messages in thread
From: Andy Whitcroft @ 2006-05-12  8:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, clameter, a.p.zijlstra, piggin, ak, rohitseth,
	mbligh, hugh, riel, andrea, arjan, mel, marcelo, anton, paulmck,
	linux-mm

Andrew Morton wrote:
> Linus Torvalds <torvalds@osdl.org> wrote:
> 
>>What happened to the VM stress-test programs that we used to test the 
>>page-out with? I forget who kept a collection of them around, but they did 
>>things like trying to cause MM problems on purpose.
> 
> 
> I think that was me, back in my programming days.
> 
> 
>>And I'm pretty sure 
>>some of the nastiest ones used shared mappings, exactly because we've had 
>>problems with the virtual scanning.
> 
> 
> http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz
> 
> run-bash-shared-mapping.sh is a good stress-tester and deadlock-finder.
> 
> Running fsx-linux (in mmap-read and mmap-write and read and write mode) in
> combination with memory pressure is a good correctness-tester.  Needs to be
> run on various filesystems too.

Well, for what it's worth (and from this thread it may not be that much),
the testing I did overnight shows green across all the test boxes I have.
The tests do include fsx-linux across a limited range of filesystems.

I'll see if I can get the other one done.

-apw


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  7:06                       ` Peter Zijlstra
  2006-05-12  8:04                         ` Nick Piggin
@ 2006-05-12  8:07                         ` Nick Piggin
  1 sibling, 0 replies; 43+ messages in thread
From: Nick Piggin @ 2006-05-12  8:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm


Peter Zijlstra wrote:

>On Thu, 2006-05-11 at 21:30 -0700, Andrew Morton wrote:
>
>>
>>We just lost that pte dirty bit, and hence the user's data.
>>
>
>I thought that at the time we clean PAGECACHE_TAG_DIRTY the page is in
>flight to disk.
>

No.

>Now that I look at it again, perhaps the page_wrprotect() call in
>clear_page_dirty_for_io()
>should be in test_set_page_writeback().
>

No. The logical operation is clearing the dirty bits from the ptes. Such
an operation would be valid even if we didn't set the ptes readonly.

And clearing dirty belongs in clear_page_dirty.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  8:04                         ` Nick Piggin
@ 2006-05-12  8:52                           ` Peter Zijlstra
  0 siblings, 0 replies; 43+ messages in thread
From: Peter Zijlstra @ 2006-05-12  8:52 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, clameter, torvalds, ak, rohitseth, mbligh, hugh,
	riel, andrea, arjan, apw, mel, marcelo, anton, paulmck, linux-mm

On Fri, 2006-05-12 at 18:04 +1000, Nick Piggin wrote:
> Peter Zijlstra wrote:
> 
> >On Thu, 2006-05-11 at 21:30 -0700, Andrew Morton wrote:
> >
> >>Nick Piggin <piggin@cyberone.com.au> wrote:
> >>
> >>> >So let's see.  We take a write fault, we mark the page dirty then we return
> >>> >to userspace which will proceed with the write and will mark the pte dirty.
> >>> >
> >>> >Later, the VM will write the page out.
> >>> >
> >>> >Later still, the pte will get cleaned by reclaim or by munmap or whatever
> >>> >and the page will be marked dirty and the page will again be written out. 
> >>> >Potentially needlessly.
> >>> >
> >>>
> >>> page_wrprotect also marks the page clean,
> >>>
> >>Oh.  I missed that when reading the comment which describes
> >>page_wrprotect() (I do go on).
> >>
> >
> >Yes, this name is not the best of names :-(
> >
> >I was aware of this, but since in my mind the counting through protection
> >faults was the prime idea, I stuck to page_wrprotect().
> >
> >But I'm hard pressed to come up with a better one. Nick proposes:
> > page_mkclean()
> >But that also doesn't cover the whole of it from my perspective.
> >
> 
> What's your perspective?
> 
> With mmap shared accounting, the _whole VM's_ perspective is that clean
> MAP_SHARED ptes are marked readonly.
> 
> The logical operation is marking the page's ptes clean. The VM mechanism
> also marks the ptes readonly as a side effect of that. Think about it:
> writeback does not want to make the page write-protected; it wants to make
> it clean.

As I said, I was looking at the added functionality on its own; that is,
counting dirty pages by trapping write faults.

However, your view, the big picture, does make more sense. I shall
rename.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-12  8:07                         ` Andy Whitcroft
@ 2006-05-12 14:25                           ` Martin J. Bligh
  0 siblings, 0 replies; 43+ messages in thread
From: Martin J. Bligh @ 2006-05-12 14:25 UTC (permalink / raw)
  To: Andy Whitcroft
  Cc: Andrew Morton, Linus Torvalds, clameter, a.p.zijlstra, piggin, ak,
	rohitseth, hugh, riel, andrea, arjan, mel, marcelo, anton,
	paulmck, linux-mm

> Well, for what it's worth (and from this thread it may not be that much),
> the testing I did overnight shows green across all the test boxes I
> have.  The tests do include fsx-linux across a limited range of filesystems.

There are no perf regressions anywhere in there either (across dbench,
reaim, kernbench, tbench, at least) on a multitude of machines ...

M.


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4
  2006-05-11 23:44                       ` Andrew Morton
  2006-05-12  0:10                         ` Linus Torvalds
  2006-05-12  8:07                         ` Andy Whitcroft
@ 2006-05-14 15:58                         ` Andy Whitcroft
  2 siblings, 0 replies; 43+ messages in thread
From: Andy Whitcroft @ 2006-05-14 15:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Linus Torvalds, clameter, a.p.zijlstra, piggin, ak, rohitseth,
	mbligh, hugh, riel, andrea, arjan, mel, marcelo, anton, paulmck,
	linux-mm

Andrew Morton wrote:
> Linus Torvalds <torvalds@osdl.org> wrote:
> 
>>What happened to the VM stress-test programs that we used to test the 
>>page-out with? I forget who kept a collection of them around, but they did 
>>things like trying to cause MM problems on purpose.
> 
> 
> I think that was me, back in my programming days.
> 
> 
>>And I'm pretty sure 
>>some of the nastiest ones used shared mappings, exactly because we've had 
>>problems with the virtual scanning.
> 
> 
> http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz
> 
> run-bash-shared-mapping.sh is a good stress-tester and deadlock-finder.

Well, my amd64 box ran run-bash-shared-mapping.sh for 6 hours without a
peep or an oops.  The machine seemed OK at the end.

-apw


^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2006-05-14 15:58 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-05-05 20:35 [RFC][PATCH] tracking dirty pages in shared mappings Peter Zijlstra
2006-05-06 13:18 ` Nick Piggin
2006-05-06 13:34   ` Peter Zijlstra
2006-05-06 13:47     ` Nick Piggin
2006-05-06 15:29       ` Peter Zijlstra
2006-05-07  0:40         ` Nick Piggin
2006-05-07  3:43           ` Nick Piggin
2006-05-08  6:43         ` Christoph Lameter
2006-05-08  7:23           ` Peter Zijlstra
2006-05-08 19:20           ` [RFC][PATCH 1/2] tracking dirty pages in shared mappings -V3 Peter Zijlstra
2006-05-09  5:41             ` Christoph Lameter
2006-05-09  6:06               ` Peter Zijlstra
2006-05-09 20:44               ` [RFC][PATCH 1/3] tracking dirty pages in shared mappings -V4 Peter Zijlstra
2006-05-09 20:52                 ` Peter Chubb
2006-05-09 20:55                   ` Martin Bligh
2006-05-09 22:56                     ` Brian Twichell
2006-05-10  0:24                     ` Linus Torvalds
2006-05-10  0:29                       ` Nick Piggin
2006-05-10  1:24                         ` Linus Torvalds
2006-05-11 15:02                 ` Andrew Morton
2006-05-11 16:39                   ` Andy Whitcroft
2006-05-11 22:52                   ` Christoph Lameter
2006-05-11 23:30                     ` Linus Torvalds
2006-05-11 23:44                       ` Andrew Morton
2006-05-12  0:10                         ` Linus Torvalds
2006-05-12  8:07                         ` Andy Whitcroft
2006-05-12 14:25                           ` Martin J. Bligh
2006-05-14 15:58                         ` Andy Whitcroft
2006-05-12  1:51                   ` Nick Piggin
2006-05-12  4:30                     ` Andrew Morton
2006-05-12  5:05                       ` Nick Piggin
2006-05-12  7:06                       ` Peter Zijlstra
2006-05-12  8:04                         ` Nick Piggin
2006-05-12  8:52                           ` Peter Zijlstra
2006-05-12  8:07                         ` Nick Piggin
2006-05-12  4:51                   ` Christoph Lameter
2006-05-09 20:44               ` [RFC][PATCH 2/3] throttle writers of shared mappings Peter Zijlstra
2006-05-09 22:54                 ` Nick Piggin
2006-05-09 22:55                   ` Nick Piggin
2006-05-10  6:25                     ` Peter Zijlstra
2006-05-09 20:44               ` [RFC][PATCH 3/3] optimize follow_pages() Peter Zijlstra
2006-05-10  6:30                 ` Peter Zijlstra
2006-05-08 19:24           ` [RFC][PATCH 2/2] throttle writers of shared mappings Peter Zijlstra

This is a public inbox; see mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).