[-mm PATCH] ocfs2: Shared writeable mmap

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [-mm PATCH] ocfs2: Shared writeable mmap
@ 2006-06-19 23:46 Mark Fasheh
  2006-06-19 23:55 ` [Ocfs2-devel] " Daniel Phillips
  2006-06-20  0:07 ` Andrew Morton
  0 siblings, 2 replies; 9+ messages in thread
From: Mark Fasheh @ 2006-06-19 23:46 UTC (permalink / raw)
  To: dhowells, akpm; +Cc: linux-fsdevel, ocfs2-devel

I finally got some time to sit down and implement an OCFS2 patch to make use
of the ->page_mkwrite() callback added by David Howells' patch (named
'add-page_mkwrite-vm_operations-method.patch' in -mm). The patches, and an
MPI program to test this can be found at:

http://kernel.org/pub/linux/kernel/people/mfasheh/ocfs2/mmap/

There's one bug however, which will cause the test program on one of the
reading nodes to see stale data if it is run several times in a row against
the same file. I have verified that the same thing works fine on a local
file system (ext3). I'm not sure where the issue is, but I have a feeling
I'm doing something bad in ocfs2_data_convert_worker(). Another possibility
is that we missed a place to put the ->page_mkwrite callback.

Unfortunately, I have to step away from this patch for a bit as I have some
higher priority issues to deal with :/ Luckily, it seems to be in a state
which I think warrants it being pushed out to the public for general review,
testing, etc. If anyone is interested, I'd also appreciate any advice or
help regarding the bug -- my VM-foo is very weak :)
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

From: Mark Fasheh <mark.fasheh@oracle.com>

ocfs2: Shared writeable mmap

Implement cluster consistent shared writeable mappings using the
->page_mkwrite() callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

---

 fs/ocfs2/dlmglue.c |   10 +++++
 fs/ocfs2/mmap.c    |  100 ++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 20 deletions(-)

4c6c09a7927affae4616607c9f0da0a95b232baa
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd528..d57860d 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2685,6 +2685,15 @@ static void ocfs2_data_convert_worker(st
        	inode = ocfs2_lock_res_inode(lockres);
 	mapping = inode->i_mapping;
 
+	/*
+	 * We need this before the filemap_fdatawrite() so that it can
+	 * transfer the dirty bit from the PTE to the
+	 * page. Unfortunately this means that even for EX->PR
+	 * downconverts, we'll lose our mappings and have to build
+	 * them up again.
+	 */
+	unmap_mapping_range(mapping, 0, 0, 0);
+
 	if (filemap_fdatawrite(mapping)) {
 		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2692,7 +2701,6 @@ static void ocfs2_data_convert_worker(st
 	sync_mapping_buffers(mapping);
 	if (blocking == LKM_EXMODE) {
 		truncate_inode_pages(mapping, 0);
-		unmap_mapping_range(mapping, 0, 0, 0);
 	} else {
 		/* We only need to wait on the I/O if we're not also
 		 * truncating pages because truncate_inode_pages waits
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 843cf9d..b53063c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -42,6 +42,23 @@ #include "file.h"
 #include "inode.h"
 #include "mmap.h"
 
+static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
+{
+	/* The best way to deal with signals in the vm path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(blocked);
+
+	/* The old mask lands in *oldset so the caller can restore it.
+	 * We should technically never get a bad return from sigprocmask */
+	return sigprocmask(SIG_BLOCK, blocked, oldset);
+}
+
+static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
+{
+	return sigprocmask(SIG_SETMASK, oldset, NULL); /* restore old mask */
+}
+
 static struct page *ocfs2_nopage(struct vm_area_struct * area,
 				 unsigned long address,
 				 int *type)
@@ -53,14 +70,7 @@ static struct page *ocfs2_nopage(struct 
 
 	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
 
-	/* The best way to deal with signals in this path is
-	 * to block them upfront, rather than allowing the
-	 * locking paths to return -ERESTARTSYS. */
-	sigfillset(&blocked);
-
-	/* We should technically never get a bad ret return
-	 * from sigprocmask */
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -68,7 +78,7 @@ static struct page *ocfs2_nopage(struct 
 
 	page = filemap_nopage(area, address, type);
 
-	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	ret = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret < 0)
 		mlog_errno(ret);
 out:
@@ -76,21 +86,73 @@ out:
 	return page;
 }
 
+/* ->page_mkwrite() handler: called before a read-only shared page is
+ * made writable. Takes cluster locks so other nodes flush their copies.
+ * Returns a negative errno on failure. */
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	sigset_t blocked, oldset;
+	int ret, ret2;
+	pgoff_t last_index;
+
+	mlog_entry("(inode %llu, page index %lu)\n",
+		   (unsigned long long)OCFS2_I(inode)->ip_blkno, page->index);
+
+	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Take a meta data lock so that we can test the page location
+	 * against the proper end of file. This particular check may
+	 * be a little paranoid. */
+	ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_restore_signals;
+	}
+
+	/* When we support holes, allocation should be handled here,
+	 * as writepage() is too late to handle ENOSPC issues.
+	 * i_size is in bytes; shift right to get the EOF page index. */
+	last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+	if (page->index > last_index) {
+		ret = -EFBIG;
+		goto out_meta_unlock;
+	}
+
+	/* Take and drop an exclusive data lock here. This will ensure
+	 * that other nodes write out and invalidate their pages for
+	 * this inode. Dlmglue handles caching of the exclusive lock,
+	 * so the page can be safely marked writeable until another
+	 * node notifies us of competing access. */
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+	else
+		ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock:
+	ocfs2_meta_unlock(inode, 0);
+
+out_restore_signals:
+	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
+	if (ret2 < 0)
+		mlog_errno(ret2);
+
+out:
+	return ret;
+}
+
 static struct vm_operations_struct ocfs2_file_vm_ops = {
-	.nopage = ocfs2_nopage,
+	.nopage		= ocfs2_nopage,
+	.page_mkwrite	= ocfs2_page_mkwrite,
 };
 
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	/* We don't want to support shared writable mappings yet. */
-	if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
-	    && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
-		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
-		/* This is -EINVAL because generic_file_readonly_mmap
-		 * returns it in a similar situation. */
-		return -EINVAL;
-	}
-
 	file_accessed(file);
 	vma->vm_ops = &ocfs2_file_vm_ops;
 	return 0;
-- 
1.3.3

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [Ocfs2-devel] [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-19 23:46 [-mm PATCH] ocfs2: Shared writeable mmap Mark Fasheh
@ 2006-06-19 23:55 ` Daniel Phillips
  2006-06-20  5:42   ` Mark Fasheh
  2006-06-20  0:07 ` Andrew Morton
  1 sibling, 1 reply; 9+ messages in thread
From: Daniel Phillips @ 2006-06-19 23:55 UTC (permalink / raw)
  To: Mark Fasheh; +Cc: dhowells, akpm, linux-fsdevel, ocfs2-devel

Mark Fasheh wrote:
> I finally got some time to sit down and implement an OCFS2 patch to make use
> of the ->page_mkwrite() callback added by David Howells' patch (named
> 'add-page_mkwrite-vm_operations-method.patch' in -mm). The patches, and an
> MPI program to test this can be found at:
> 
> http://kernel.org/pub/linux/kernel/people/mfasheh/ocfs2/mmap/
> 
> There's one bug however, which will cause the test program on one of the
> reading nodes to see stale data if it is run several times in a row against
> the same file. I have verified that the same thing works fine on a local
> file system (ext3). I'm not sure where the issue is, but I have a feeling
> I'm doing something bad in ocfs2_data_convert_worker(). Another possibility
> is that we missed a place to put the ->page_mkwrite callback.
> 
> Unfortunately, I have to step away from this patch for a bit as I have some
> higher priority issues to deal with :/ Luckily, it seems to be in a state
> which I think warrants it being pushed out to the public for general review,
> testing, etc. If anyone is interested, I'd also appreciate any advice or
> help regarding the bug -- my VM-foo is very weak :)
> 	--Mark

Hi Mark,

While this may be a great patch, you didn't actually explain what it does,
how it does it or why it does it.

Regards,

Daniel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-19 23:46 [-mm PATCH] ocfs2: Shared writeable mmap Mark Fasheh
  2006-06-19 23:55 ` [Ocfs2-devel] " Daniel Phillips
@ 2006-06-20  0:07 ` Andrew Morton
  2006-06-20  0:52   ` Mark Fasheh
                     ` (4 more replies)
  1 sibling, 5 replies; 9+ messages in thread
From: Andrew Morton @ 2006-06-20  0:07 UTC (permalink / raw)
  To: Mark Fasheh; +Cc: dhowells, linux-fsdevel, ocfs2-devel, Peter Zijlstra

Mark Fasheh <mark.fasheh@oracle.com> wrote:
>
> I finally got some time to sit down and implement an OCFS2 patch to make use
> of the ->page_mkwrite() callback added by David Howells' patch (named
> 'add-page_mkwrite-vm_operations-method.patch' in -mm). The patches, and an
> MPI program to test this can be found at:
> 
> http://kernel.org/pub/linux/kernel/people/mfasheh/ocfs2/mmap/
> 
> There's one bug however, which will cause the test program on one of the
> reading nodes to see stale data if it is run several times in a row against
> the same file. I have verified that the same thing works fine on a local
> file system (ext3). I'm not sure where the issue is, but I have a feeling
> I'm doing something bad in ocfs2_data_convert_worker(). Another possibility
> is that we missed a place to put the ->page_mkwrite callback.
> 
> Unfortunately, I have to step away from this patch for a bit as I have some
> higher priority issues to deal with :/ Luckily, it seems to be in a state
> which I think warrants it being pushed out to the public for general review,
> testing, etc. If anyone is interested, I'd also appreciate any advice or
> help regarding the bug -- my VM-foo is very weak :)

Peter Zijlstra told me yesterday:

  There is a problem with the page-mkwrite last posted to lkml.  /me
  checks your tree...  Yeah, that version has a problem:
  http://kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.17-rc6/2.6.17-rc6-mm2/broken-out/add-page_mkwrite-vm_operations-method.patch

  The thing is that get_user_pages(.write=1, .force=1) can generate COW
  hits on read-only shared mappings, this patch traps those as page_mkwrite
  candidates and fails to handle them the old way.

Which I was unaware of and haven't started to think about.  Probably I'll
drop the existing patch and pick up the one he's sending out.  But it's
presumably based on top of all the dirty-page-tracking patches which I also
haven't thought about yet and which need _serious_ thought.

It would be better to get a fix against the existing
add-page_mkwrite-vm_operations-method.patch so at least we can get that
merged up.  But nobody seems to be offering that.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-20  0:07 ` Andrew Morton
@ 2006-06-20  0:52   ` Mark Fasheh
  2006-06-20  7:07   ` Peter Zijlstra
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Mark Fasheh @ 2006-06-20  0:52 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dhowells, linux-fsdevel, Peter Zijlstra, ocfs2-devel

On Mon, Jun 19, 2006 at 05:07:36PM -0700, Andrew Morton wrote:
>   The thing is that get_user_pages(.write=1, .force=1) can generate COW
>   hits on read-only shared mappings, this patch traps those as page_mkwrite
>   candidates and fails to handle them the old way.
Ahh ok. Too bad this doesn't seem like it could be causing my bug :( Can I
ask what the old way of handling this is?

> It would be better to get a fix against the existing
> add-page_mkwrite-vm_operations-method.patch so at least we can get that
> merged up.  But nobody seems to be offering that.
Were I capable, I'd certainly offer up a patch :/ Definitely though, I agree
that sorting this out first seems like a reasonable way to go.
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-19 23:55 ` [Ocfs2-devel] " Daniel Phillips
@ 2006-06-20  5:42   ` Mark Fasheh
  0 siblings, 0 replies; 9+ messages in thread
From: Mark Fasheh @ 2006-06-20  5:42 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: dhowells, akpm, linux-fsdevel, ocfs2-devel

On Mon, Jun 19, 2006 at 04:55:14PM -0700, Daniel Phillips wrote:
> While this may be a great patch, you didn't actually explain what it does,
> how it does it or why it does it.
Essentially data in OCFS2 is already covered by a cluster lock. We simply
make use of the lock in ->page_mkwrite to ensure that data written via
shared writeable mmap will be coherent with respect to other nodes.

Cluster locks can have levels of NL (no lock), PR (protected read, aka
shared) or EX (exclusive). The act of taking the lock on one node triggers
actions on the other nodes which have competing lock levels. In the case of
locks covering inode data, this means that the other nodes will do a
combination of sync and invalidate against their pages. This is what
ocfs2_data_convert_worker() does.

So the patch just makes use of the callback to take a data lock on the
associated inode. We also take a meta data lock to verify inode size against
the page index, but I'm not completely sure that piece is absolutely
necessary as we should've already done it from ->nopage.
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-20  0:07 ` Andrew Morton
  2006-06-20  0:52   ` Mark Fasheh
@ 2006-06-20  7:07   ` Peter Zijlstra
  2006-06-20 12:59   ` David Howells
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Peter Zijlstra @ 2006-06-20  7:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Mark Fasheh, dhowells, linux-fsdevel, ocfs2-devel

On Mon, 2006-06-19 at 17:07 -0700, Andrew Morton wrote:
> Mark Fasheh <mark.fasheh@oracle.com> wrote:
> >
> > I finally got some time to sit down and implement an OCFS2 patch to make use
> > of the ->page_mkwrite() callback added by David Howells' patch (named
> > 'add-page_mkwrite-vm_operations-method.patch' in -mm). The patches, and an
> > MPI program to test this can be found at:
> > 
> > http://kernel.org/pub/linux/kernel/people/mfasheh/ocfs2/mmap/
> > 
> > There's one bug however, which will cause the test program on one of the
> > reading nodes to see stale data if it is run several times in a row against
> > the same file. I have verified that the same thing works fine on a local
> > file system (ext3). I'm not sure where the issue is, but I have a feeling
> > I'm doing something bad in ocfs2_data_convert_worker(). Another possibility
> > is that we missed a place to put the ->page_mkwrite callback.
> > 
> > Unfortunately, I have to step away from this patch for a bit as I have some
> > higher priority issues to deal with :/ Luckily, it seems to be in a state
> > which I think warrants it being pushed out to the public for general review,
> > testing, etc. If anyone is interested, I'd also appreciate any advice or
> > help regarding the bug -- my VM-foo is very weak :)
> 
> Peter Zijlstra told me yesterday:
> 
>   There is a problem with the page-mkwrite last posted to lkml.  /me
>   checks your tree...  Yeah, that version has a problem:
>   http://kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.17-rc6/2.6.17-rc6-mm2/broken-out/add-page_mkwrite-vm_operations-method.patch
> 
>   The thing is that get_user_pages(.write=1, .force=1) can generate COW
>   hits on read-only shared mappings, this patch traps those as page_mkwrite
>   candidates and fails to handle them the old way.

The -v9 version of the dirty page tracking I sent out fixes this problem
by affiliation; the following patch should also be enough:

---
 mm/memory.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: 2.6-mm/mm/memory.c
===================================================================
--- 2.6-mm.orig/mm/memory.c	2006-06-20 09:02:58.000000000 +0200
+++ 2.6-mm/mm/memory.c	2006-06-20 09:06:01.000000000 +0200
@@ -1464,7 +1464,8 @@ static int do_wp_page(struct mm_struct *
 	if (!old_page)
 		goto gotten;
 
-	if (unlikely(vma->vm_flags & VM_SHARED)) {
+	if (unlikely(vma->vm_flags & (VM_SHARED|VM_WRITE) ==
+				VM_SHARED|VM_WRITE) {
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 			/*
 			 * Notify the address space that the page is about to



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-20  0:07 ` Andrew Morton
  2006-06-20  0:52   ` Mark Fasheh
  2006-06-20  7:07   ` Peter Zijlstra
@ 2006-06-20 12:59   ` David Howells
  2006-06-20 13:02   ` David Howells
  2006-06-20 13:20   ` [PATCH] Add notification of page becoming writable to VMA ops David Howells
  4 siblings, 0 replies; 9+ messages in thread
From: David Howells @ 2006-06-20 12:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andrew Morton, Mark Fasheh, dhowells, linux-fsdevel, ocfs2-devel

Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> -	if (unlikely(vma->vm_flags & VM_SHARED)) {
> +	if (unlikely(vma->vm_flags & (VM_SHARED|VM_WRITE) ==
> +				VM_SHARED|VM_WRITE) {

NAK!

"==" is higher priority than "|".  What you meant was:

-	if (unlikely(vma->vm_flags & VM_SHARED)) {
+	if (unlikely(vma->vm_flags & (VM_SHARED|VM_WRITE) ==
+				(VM_SHARED|VM_WRITE)) {

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [-mm PATCH] ocfs2: Shared writeable mmap
  2006-06-20  0:07 ` Andrew Morton
                     ` (2 preceding siblings ...)
  2006-06-20 12:59   ` David Howells
@ 2006-06-20 13:02   ` David Howells
  2006-06-20 13:20   ` [PATCH] Add notification of page becoming writable to VMA ops David Howells
  4 siblings, 0 replies; 9+ messages in thread
From: David Howells @ 2006-06-20 13:02 UTC (permalink / raw)
  To: David Howells
  Cc: Peter Zijlstra, Andrew Morton, Mark Fasheh, linux-fsdevel,
	ocfs2-devel

David Howells <dhowells@redhat.com> wrote:

> Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> 
> > -	if (unlikely(vma->vm_flags & VM_SHARED)) {
> > +	if (unlikely(vma->vm_flags & (VM_SHARED|VM_WRITE) ==
> > +				VM_SHARED|VM_WRITE) {
> 
> NAK!
> 
> "==" is higher priority than "|".  What you meant was:
> 
> -	if (unlikely(vma->vm_flags & VM_SHARED)) {
> +	if (unlikely(vma->vm_flags & (VM_SHARED|VM_WRITE) ==
> +				(VM_SHARED|VM_WRITE)) {

Or, rather:

-	if (unlikely(vma->vm_flags & VM_SHARED)) {
+	if (unlikely(vma->vm_flags & (VM_SHARED|VM_WRITE) ==
+				(VM_SHARED|VM_WRITE))) {

It has insufficient closing brackets otherwise.

David

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] Add notification of page becoming writable to VMA ops
  2006-06-20  0:07 ` Andrew Morton
                     ` (3 preceding siblings ...)
  2006-06-20 13:02   ` David Howells
@ 2006-06-20 13:20   ` David Howells
  4 siblings, 0 replies; 9+ messages in thread
From: David Howells @ 2006-06-20 13:20 UTC (permalink / raw)
  To: David Howells
  Cc: Peter Zijlstra, Andrew Morton, Mark Fasheh, linux-fsdevel,
	ocfs2-devel


From: David Howells <dhowells@redhat.com>

The attached patch adds a new VMA operation to notify a filesystem or other
driver about the MMU generating a fault because userspace attempted to write
to a page mapped through a read-only PTE.

This facility permits the filesystem or driver to:

 (*) Implement storage allocation/reservation on attempted write, and so to
     deal with problems such as ENOSPC more gracefully (perhaps by generating
     SIGBUS).

 (*) Delay making the page writable until the contents have been written to a
     backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS.
     It permits the filesystem to have some guarantee about the state of the
     cache.

 (*) Account and limit number of dirty pages. This is one piece of the puzzle
     needed to make shared writable mapping work safely in FUSE.

The patch has been fixed to only check pages in shared writable VMAs in
do_wp_page(), not in all shared VMAs.

Signed-Off-By: David Howells <dhowells@redhat.com>
---

 include/linux/mm.h |    4 ++
 mm/memory.c        |  100 ++++++++++++++++++++++++++++++++++++++++------------
 mm/mmap.c          |   12 +++++-
 mm/mprotect.c      |   11 +++++-
 4 files changed, 99 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1154684..cd3c2cf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -200,6 +200,10 @@ struct vm_operations_struct {
 	void (*close)(struct vm_area_struct * area);
 	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
 	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+
+	/* notification that a previously read-only page is about to become
+	 * writable, if an error is returned it will cause a SIGBUS */
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
 #ifdef CONFIG_NUMA
 	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 0ec7bc6..b7d7e29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1445,25 +1445,60 @@ static int do_wp_page(struct mm_struct *
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
+	int reuse, ret = VM_FAULT_MINOR;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page)
 		goto gotten;
 
-	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
-		int reuse = can_share_swap_page(old_page);
-		unlock_page(old_page);
-		if (reuse) {
-			flush_cache_page(vma, address, pte_pfn(orig_pte));
-			entry = pte_mkyoung(orig_pte);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			ptep_set_access_flags(vma, address, page_table, entry, 1);
-			update_mmu_cache(vma, address, entry);
-			lazy_mmu_prot_update(entry);
-			ret |= VM_FAULT_WRITE;
-			goto unlock;
+	if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
+		     (VM_SHARED|VM_WRITE))) {
+		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			/*
+			 * Notify the address space that the page is about to
+			 * become writable so that it can prohibit this or wait
+			 * for the page to get into an appropriate state.
+			 *
+			 * We do this without the lock held, so that it can
+			 * sleep if it needs to.
+			 */
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+
+			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+				goto unwritable_page;
+
+			page_cache_release(old_page);
+
+			/*
+			 * Since we dropped the lock we need to revalidate
+			 * the PTE as someone else may have changed it.  If
+			 * they did, we just return, as we can count on the
+			 * MMU to tell us if they didn't also make it writable.
+			 */
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte))
+				goto unlock;
 		}
+
+		reuse = 1;
+	} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
+		reuse = can_share_swap_page(old_page);
+		unlock_page(old_page);
+	} else {
+		reuse = 0;
+	}
+
+	if (reuse) {
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = pte_mkyoung(orig_pte);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		ptep_set_access_flags(vma, address, page_table, entry, 1);
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+		ret |= VM_FAULT_WRITE;
+		goto unlock;
 	}
 
 	/*
@@ -1523,6 +1558,10 @@ oom:
 	if (old_page)
 		page_cache_release(old_page);
 	return VM_FAULT_OOM;
+
+unwritable_page:
+	page_cache_release(old_page);
+	return VM_FAULT_SIGBUS;
 }
 
 /*
@@ -2074,18 +2113,31 @@ retry:
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	if (write_access && !(vma->vm_flags & VM_SHARED)) {
-		struct page *page;
+	if (write_access) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			struct page *page;
 
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-		if (!page)
-			goto oom;
-		copy_user_highpage(page, new_page, address);
-		page_cache_release(new_page);
-		new_page = page;
-		anon = 1;
+			if (unlikely(anon_vma_prepare(vma)))
+				goto oom;
+			page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+			if (!page)
+				goto oom;
+			copy_user_highpage(page, new_page, address);
+			page_cache_release(new_page);
+			new_page = page;
+			anon = 1;
+
+		} else {
+			/* if the page will be shareable, see if the backing
+			 * address space wants to know that the page is about
+			 * to become writable */
+			if (vma->vm_ops->page_mkwrite &&
+			    vma->vm_ops->page_mkwrite(vma, new_page) < 0
+			    ) {
+				page_cache_release(new_page);
+				return VM_FAULT_SIGBUS;
+			}
+		}
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee123..6446c61 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 	vma->vm_flags = vm_flags;
-	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
+	vma->vm_page_prot = protection_map[vm_flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
 			goto free_vma;
 	}
 
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		vma->vm_page_prot =
+			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
+
 	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 	 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr,
 	vma->vm_end = addr + len;
 	vma->vm_pgoff = pgoff;
 	vma->vm_flags = flags;
-	vma->vm_page_prot = protection_map[flags & 0x0f];
+	vma->vm_page_prot = protection_map[flags &
+				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c14d42..2697abd 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -106,6 +106,7 @@ mprotect_fixup(struct vm_area_struct *vm
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	unsigned int mask;
 	pgprot_t newprot;
 	pgoff_t pgoff;
 	int error;
@@ -132,8 +133,6 @@ mprotect_fixup(struct vm_area_struct *vm
 		}
 	}
 
-	newprot = protection_map[newflags & 0xf];
-
 	/*
 	 * First try to merge with previous and/or next vma.
 	 */
@@ -160,6 +159,14 @@ mprotect_fixup(struct vm_area_struct *vm
 	}
 
 success:
+	/* Don't make the VMA automatically writable if it's shared, but the
+	 * backer wishes to know when pages are first written to */
+	mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
+		mask &= ~VM_SHARED;
+
+	newprot = protection_map[newflags & mask];
+
 	/*
 	 * vm_flags and vm_page_prot are protected by the mmap_sem
 	 * held in write mode.

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2006-06-20 13:20 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-19 23:46 [-mm PATCH] ocfs2: Shared writeable mmap Mark Fasheh
2006-06-19 23:55 ` [Ocfs2-devel] " Daniel Phillips
2006-06-20  5:42   ` Mark Fasheh
2006-06-20  0:07 ` Andrew Morton
2006-06-20  0:52   ` Mark Fasheh
2006-06-20  7:07   ` Peter Zijlstra
2006-06-20 12:59   ` David Howells
2006-06-20 13:02   ` David Howells
2006-06-20 13:20   ` [PATCH] Add notification of page becoming writable to VMA ops David Howells

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).