[patch 09/28] mm: page_mkwrite change prototype to match fault

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [patch 09/28] mm: page_mkwrite change prototype to match fault
       [not found] ` <20090514225413.GA705@kroah.com>
@ 2009-05-14 22:51   ` Greg KH
  2009-05-14 22:51   ` [patch 10/28] fs: fix page_mkwrite error cases in core code and btrfs Greg KH
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 6+ messages in thread
From: Greg KH @ 2009-05-14 22:51 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Justin Forbes, Zwane Mwaikambo, Theodore Ts'o, Randy Dunlap,
	Dave Jones, Chuck Wolber, Chris Wedgwood, Michael Krufky,
	Chuck Ebbert, Domenico Andreoli, Willy Tarreau,
	Rodrigo Rubira Branco, Jake Edge, Eugene Teo, torvalds, akpm,
	alan, linux-fsdevel, linux-mm, Nick Piggin, Chris Mason,
	Trond Myklebust, Miklos Szeredi, Steven Whitehouse, Mark Fasheh,
	Joel 

[-- Attachment #1: mm-page_mkwrite-change-prototype-to-match-fault.patch --]
[-- Type: text/plain, Size: 11624 bytes --]

2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Nick Piggin <npiggin@suse.de>

commit c2ec175c39f62949438354f603f4aa170846aabb upstream


mm: page_mkwrite change prototype to match fault

Change the page_mkwrite prototype to take a struct vm_fault, and return
VM_FAULT_xxx flags.  There should be no functional change.

This makes it possible to return much more detailed error information to
the VM (and also can provide more information eg.  virtual_address to the
driver, which might be important in some special cases).

This is required for a subsequent fix.  And will also make it easier to
merge page_mkwrite() with fault() in future.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Artem Bityutskiy <dedekind@infradead.org>
Cc: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 Documentation/filesystems/Locking |    2 +-
 drivers/video/fb_defio.c          |    3 ++-
 fs/buffer.c                       |    6 +++++-
 fs/ext4/ext4.h                    |    2 +-
 fs/ext4/inode.c                   |    5 ++++-
 fs/fuse/file.c                    |    3 ++-
 fs/gfs2/ops_file.c                |    5 ++++-
 fs/nfs/file.c                     |    5 ++++-
 fs/ocfs2/mmap.c                   |    6 ++++--
 fs/ubifs/file.c                   |    9 ++++++---
 fs/xfs/linux-2.6/xfs_file.c       |    4 ++--
 include/linux/buffer_head.h       |    2 +-
 include/linux/mm.h                |    3 ++-
 mm/memory.c                       |   26 ++++++++++++++++++++++----
 14 files changed, 60 insertions(+), 21 deletions(-)

--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -502,7 +502,7 @@ prototypes:
 	void (*open)(struct vm_area_struct*);
 	void (*close)(struct vm_area_struct*);
 	int (*fault)(struct vm_area_struct*, struct vm_fault *);
-	int (*page_mkwrite)(struct vm_area_struct *, struct page *);
+	int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
 	int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
 
 locking rules:
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -70,8 +70,9 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync);
 
 /* vm_ops->page_mkwrite handler */
 static int fb_deferred_io_mkwrite(struct vm_area_struct *vma,
-				  struct page *page)
+				  struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct fb_info *info = vma->vm_private_data;
 	struct fb_deferred_io *fbdefio = info->fbdefio;
 	struct page *cur;
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2402,9 +2402,10 @@ int block_commit_write(struct page *page
  * unlock the page.
  */
 int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
@@ -2429,6 +2430,9 @@ block_page_mkwrite(struct vm_area_struct
 		ret = block_commit_write(page, 0, end);
 
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
+
 	unlock_page(page);
 	return ret;
 }
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1084,7 +1084,7 @@ extern int ext4_meta_trans_blocks(struct
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4861,8 +4861,9 @@ static int ext4_bh_unmapped(handle_t *ha
 	return !buffer_mapped(bh);
 }
 
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
@@ -4913,6 +4914,8 @@ int ext4_page_mkwrite(struct vm_area_str
 		goto out_unlock;
 	ret = 0;
 out_unlock:
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	up_read(&inode->i_alloc_sem);
 	return ret;
 }
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1219,8 +1219,9 @@ static void fuse_vma_close(struct vm_are
  * - sync(2)
  * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
  */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	/*
 	 * Don't use page->mapping as it may become NULL from a
 	 * concurrent truncate.
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -338,8 +338,9 @@ static int gfs2_allocate_page_backing(st
  * blocks allocated on disk to back that page.
  */
 
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -411,6 +412,8 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -448,8 +448,9 @@ const struct address_space_operations nf
 	.launder_page = nfs_launder_page,
 };
 
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
 	struct dentry *dentry = filp->f_path.dentry;
 	unsigned pagelen;
@@ -480,6 +481,8 @@ static int nfs_vm_page_mkwrite(struct vm
 		ret = pagelen;
 out_unlock:
 	unlock_page(page);
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -150,8 +150,9 @@ out:
 	return ret;
 }
 
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct buffer_head *di_bh = NULL;
 	sigset_t blocked, oldset;
@@ -192,7 +193,8 @@ out:
 	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret2 < 0)
 		mlog_errno(ret2);
-
+	if (ret)
+		ret = VM_FAULT_SIGBUS;
 	return ret;
 }
 
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1140,8 +1140,9 @@ static int ubifs_releasepage(struct page
  * mmap()d file has taken write protection fault and is being made
  * writable. UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct timespec now = ubifs_current_time(inode);
@@ -1153,7 +1154,7 @@ static int ubifs_vm_page_mkwrite(struct 
 	ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
 
 	if (unlikely(c->ro_media))
-		return -EROFS;
+		return VM_FAULT_SIGBUS; /* -EROFS */
 
 	/*
 	 * We have not locked @page so far so we may budget for changing the
@@ -1186,7 +1187,7 @@ static int ubifs_vm_page_mkwrite(struct 
 		if (err == -ENOSPC)
 			ubifs_warn("out of space for mmapped file "
 				   "(inode number %lu)", inode->i_ino);
-		return err;
+		return VM_FAULT_SIGBUS;
 	}
 
 	lock_page(page);
@@ -1226,6 +1227,8 @@ static int ubifs_vm_page_mkwrite(struct 
 out_unlock:
 	unlock_page(page);
 	ubifs_release_budget(c, &req);
+	if (err)
+		err = VM_FAULT_SIGBUS;
 	return err;
 }
 
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -427,9 +427,9 @@ xfs_file_ioctl_invis(
 STATIC int
 xfs_vm_page_mkwrite(
 	struct vm_area_struct	*vma,
-	struct page		*page)
+	struct vm_fault		*vmf)
 {
-	return block_page_mkwrite(vma, page, xfs_get_blocks);
+	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
 const struct file_operations xfs_file_operations = {
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -222,7 +222,7 @@ int cont_write_begin(struct file *, stru
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
-int block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
 void block_sync_page(struct page *);
 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -138,6 +138,7 @@ extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
 #define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
 
 
 /*
@@ -173,7 +174,7 @@ struct vm_operations_struct {
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
-	int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
+	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 	/* called by access_process_vm when get_user_pages() fails, typically
 	 * for use by special VMAs that can switch between memory and hardware
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1801,6 +1801,15 @@ static int do_wp_page(struct mm_struct *
 		 * get_user_pages(.write=1, .force=1).
 		 */
 		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+			struct vm_fault vmf;
+			int tmp;
+
+			vmf.virtual_address = (void __user *)(address &
+								PAGE_MASK);
+			vmf.pgoff = old_page->index;
+			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+			vmf.page = old_page;
+
 			/*
 			 * Notify the address space that the page is about to
 			 * become writable so that it can prohibit this or wait
@@ -1812,8 +1821,12 @@ static int do_wp_page(struct mm_struct *
 			page_cache_get(old_page);
 			pte_unmap_unlock(page_table, ptl);
 
-			if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
+			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+			if (unlikely(tmp &
+					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+				ret = tmp;
 				goto unwritable_page;
+			}
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -1955,7 +1968,7 @@ oom:
 
 unwritable_page:
 	page_cache_release(old_page);
-	return VM_FAULT_SIGBUS;
+	return ret;
 }
 
 /*
@@ -2472,9 +2485,14 @@ static int __do_fault(struct mm_struct *
 			 * to become writable
 			 */
 			if (vma->vm_ops->page_mkwrite) {
+				int tmp;
+
 				unlock_page(page);
-				if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
-					ret = VM_FAULT_SIGBUS;
+				vmf.flags |= FAULT_FLAG_MKWRITE;
+				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+				if (unlikely(tmp &
+					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+					ret = tmp;
 					anon = 1; /* no anon but release vmf.page */
 					goto out_unlocked;
 				}



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [patch 10/28] fs: fix page_mkwrite error cases in core code and btrfs
       [not found] ` <20090514225413.GA705@kroah.com>
  2009-05-14 22:51   ` [patch 09/28] mm: page_mkwrite change prototype to match fault Greg KH
@ 2009-05-14 22:51   ` Greg KH
  2009-05-14 22:51   ` [patch 11/28] mm: close page_mkwrite races Greg KH
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 6+ messages in thread
From: Greg KH @ 2009-05-14 22:51 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Justin Forbes, Zwane Mwaikambo, Theodore Ts'o, Randy Dunlap,
	Dave Jones, Chuck Wolber, Chris Wedgwood, Michael Krufky,
	Chuck Ebbert, Domenico Andreoli, Willy Tarreau,
	Rodrigo Rubira Branco, Jake Edge, Eugene Teo, torvalds, akpm,
	alan, linux-fsdevel, linux-mm, Chris Mason, Nick Piggin

[-- Attachment #1: fs-fix-page_mkwrite-error-cases-in-core-code-and-btrfs.patch --]
[-- Type: text/plain, Size: 2330 bytes --]

2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Nick Piggin <npiggin@suse.de>

commit 56a76f8275c379ed73c8a43cfa1dfa2f5e9cfa19 upstream

fs: fix page_mkwrite error cases in core code and btrfs

page_mkwrite is called with neither the page lock nor the ptl held.  This
means a page can be concurrently truncated or invalidated out from
underneath it.  Callers are supposed to prevent truncate races themselves,
however previously the only thing they can do in case they hit one is to
raise a SIGBUS.  A sigbus is wrong for the case that the page has been
invalidated or truncated within i_size (eg.  hole punched).  Callers may
also have to perform memory allocations in this path, where again, SIGBUS
would be wrong.

The previous patch ("mm: page_mkwrite change prototype to match fault")
made it possible to properly specify errors.  Convert the generic buffer.c
code and btrfs to return sane error values (in the case of page removed
from pagecache, VM_FAULT_NOPAGE will cause the fault handler to exit
without doing anything, and the fault will be retried properly).

This fixes core code, and converts btrfs as a template/example.  All other
filesystems defining their own page_mkwrite should be fixed in a similar
manner.

Acked-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 fs/buffer.c |   12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2409,7 +2409,7 @@ block_page_mkwrite(struct vm_area_struct
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = -EINVAL;
+	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */

 	lock_page(page);
 	size = i_size_read(inode);
@@ -2429,10 +2429,14 @@ block_page_mkwrite(struct vm_area_struct
 	if (!ret)
 		ret = block_commit_write(page, 0, end);

-out_unlock:
-	if (ret)
-		ret = VM_FAULT_SIGBUS;
+	if (unlikely(ret)) {
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else /* -ENOSPC, -EIO, etc */
+			ret = VM_FAULT_SIGBUS;
+	}

+out_unlock:
 	unlock_page(page);
 	return ret;
 }

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [patch 11/28] mm: close page_mkwrite races
       [not found] ` <20090514225413.GA705@kroah.com>
  2009-05-14 22:51   ` [patch 09/28] mm: page_mkwrite change prototype to match fault Greg KH
  2009-05-14 22:51   ` [patch 10/28] fs: fix page_mkwrite error cases in core code and btrfs Greg KH
@ 2009-05-14 22:51   ` Greg KH
  2009-05-14 22:51   ` [patch 12/28] GFS2: Fix page_mkwrite() return code Greg KH
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 6+ messages in thread
From: Greg KH @ 2009-05-14 22:51 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Justin Forbes, Zwane Mwaikambo, Theodore Ts'o, Randy Dunlap,
	Dave Jones, Chuck Wolber, Chris Wedgwood, Michael Krufky,
	Chuck Ebbert, Domenico Andreoli, Willy Tarreau,
	Rodrigo Rubira Branco, Jake Edge, Eugene Teo, torvalds, akpm,
	alan, linux-fsdevel, linux-mm, Sage Weil, Trond Myklebust,
	Nick Piggin, Valdis Kletnieks

[-- Attachment #1: mm-close-page_mkwrite-races.patch --]
[-- Type: text/plain, Size: 10276 bytes --]


2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Nick Piggin <npiggin@suse.de>

commit b827e496c893de0c0f142abfaeb8730a2fd6b37f upstream

mm: close page_mkwrite races

Change page_mkwrite to allow implementations to return with the page
locked, and also change it's callers (in page fault paths) to hold the
lock until the page is marked dirty.  This allows the filesystem to have
full control of page dirtying events coming from the VM.

Rather than simply hold the page locked over the page_mkwrite call, we
call page_mkwrite with the page unlocked and allow callers to return with
it locked, so filesystems can avoid LOR conditions with page lock.

The problem with the current scheme is this: a filesystem that wants to
associate some metadata with a page as long as the page is dirty, will
perform this manipulation in its ->page_mkwrite.  It currently then must
return with the page unlocked and may not hold any other locks (according
to existing page_mkwrite convention).

In this window, the VM could write out the page, clearing page-dirty.  The
filesystem has no good way to detect that a dirty pte is about to be
attached, so it will happily write out the page, at which point, the
filesystem may manipulate the metadata to reflect that the page is no
longer dirty.

It is not always possible to perform the required metadata manipulation in
->set_page_dirty, because that function cannot block or fail.  The
filesystem may need to allocate some data structure, for example.

And the VM cannot mark the pte dirty before page_mkwrite, because
page_mkwrite is allowed to fail, so we must not allow any window where the
page could be written to if page_mkwrite does fail.

This solution of holding the page locked over the 3 critical operations
(page_mkwrite, setting the pte dirty, and finally setting the page dirty)
closes out races nicely, preventing page cleaning for writeout being
initiated in that window.  This provides the filesystem with a strong
synchronisation against the VM here.

- Sage needs this race closed for ceph filesystem.
- Trond for NFS (http://bugzilla.kernel.org/show_bug.cgi?id=12913).
- I need it for fsblock.
- I suspect other filesystems may need it too (eg. btrfs).
- I have converted buffer.c to the new locking. Even simple block allocation
  under dirty pages might be susceptible to i_size changing under partial page
  at the end of file (we also have a buffer.c-side problem here, but it cannot
  be fixed properly without this patch).
- Other filesystems (eg. NFS, maybe btrfs) will need to change their
  page_mkwrite functions themselves.

[ This also moves page_mkwrite another step closer to fault, which should
  eventually allow page_mkwrite to be moved into ->fault, and thus avoiding a
  filesystem calldown and page lock/unlock cycle in __do_fault. ]

[akpm@linux-foundation.org: fix derefs of NULL ->mapping]
Cc: Sage Weil <sage@newdream.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>


---
 Documentation/filesystems/Locking |   24 +++++---
 fs/buffer.c                       |   10 ++-
 mm/memory.c                       |  108 ++++++++++++++++++++++++++------------
 3 files changed, 98 insertions(+), 44 deletions(-)

--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -509,16 +509,24 @@ locking rules:
 		BKL	mmap_sem	PageLocked(page)
 open:		no	yes
 close:		no	yes
-fault:		no	yes
-page_mkwrite:	no	yes		no
+fault:		no	yes		can return with page locked
+page_mkwrite:	no	yes		can return with page locked
 access:		no	yes
 
-	->page_mkwrite() is called when a previously read-only page is
-about to become writeable. The file system is responsible for
-protecting against truncate races. Once appropriate action has been
-taking to lock out truncate, the page range should be verified to be
-within i_size. The page mapping should also be checked that it is not
-NULL.
+	->fault() is called when a previously not present pte is about
+to be faulted in. The filesystem must find and return the page associated
+with the passed in "pgoff" in the vm_fault structure. If it is possible that
+the page may be truncated and/or invalidated, then the filesystem must lock
+the page, then ensure it is not already truncated (the page lock will block
+subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
+locked. The VM will unlock the page.
+
+	->page_mkwrite() is called when a previously read-only pte is
+about to become writeable. The filesystem again must ensure that there are
+no truncate/invalidate races, and then return with the page locked. If
+the page has been truncated, the filesystem should not look up a new page
+like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
+will cause the VM to retry the fault.
 
 	->access() is called when get_user_pages() fails in
 acces_process_vm(), typically used to debug a process through
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2416,7 +2416,8 @@ block_page_mkwrite(struct vm_area_struct
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
 		/* page got truncated out from underneath us */
-		goto out_unlock;
+		unlock_page(page);
+		goto out;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2430,14 +2431,15 @@ block_page_mkwrite(struct vm_area_struct
 		ret = block_commit_write(page, 0, end);
 
 	if (unlikely(ret)) {
+		unlock_page(page);
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
 		else /* -ENOSPC, -EIO, etc */
 			ret = VM_FAULT_SIGBUS;
-	}
+	} else
+		ret = VM_FAULT_LOCKED;
 
-out_unlock:
-	unlock_page(page);
+out:
 	return ret;
 }
 
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1827,6 +1827,15 @@ static int do_wp_page(struct mm_struct *
 				ret = tmp;
 				goto unwritable_page;
 			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(old_page);
+				if (!old_page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(old_page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(old_page));
 
 			/*
 			 * Since we dropped the lock we need to revalidate
@@ -1836,9 +1845,11 @@ static int do_wp_page(struct mm_struct *
 			 */
 			page_table = pte_offset_map_lock(mm, pmd, address,
 							 &ptl);
-			page_cache_release(old_page);
-			if (!pte_same(*page_table, orig_pte))
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
 				goto unlock;
+			}
 
 			page_mkwrite = 1;
 		}
@@ -1943,9 +1954,6 @@ gotten:
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
-
 		/*
 		 * Yes, Virginia, this is actually required to prevent a race
 		 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -1954,16 +1962,41 @@ unlock:
 		 *
 		 * do_no_page is protected similarly.
 		 */
-		wait_on_page_locked(dirty_page);
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
 		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping)	{
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
 	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
-	if (old_page)
+	if (old_page) {
+		if (page_mkwrite) {
+			unlock_page(old_page);
+			page_cache_release(old_page);
+		}
 		page_cache_release(old_page);
+	}
 	return VM_FAULT_OOM;
 
 unwritable_page:
@@ -2488,27 +2521,22 @@ static int __do_fault(struct mm_struct *
 				int tmp;
 
 				unlock_page(page);
-				vmf.flags |= FAULT_FLAG_MKWRITE;
+				vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
 				tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
 				if (unlikely(tmp &
 					  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
 					ret = tmp;
-					anon = 1; /* no anon but release vmf.page */
-					goto out_unlocked;
-				}
-				lock_page(page);
-				/*
-				 * XXX: this is not quite right (racy vs
-				 * invalidate) to unlock and relock the page
-				 * like this, however a better fix requires
-				 * reworking page_mkwrite locking API, which
-				 * is better done later.
-				 */
-				if (!page->mapping) {
-					ret = 0;
-					anon = 1; /* no anon but release vmf.page */
-					goto out;
+					goto unwritable_page;
 				}
+				if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+					lock_page(page);
+					if (!page->mapping) {
+						ret = 0; /* retry the fault */
+						unlock_page(page);
+						goto unwritable_page;
+					}
+				} else
+					VM_BUG_ON(!PageLocked(page));
 				page_mkwrite = 1;
 			}
 		}
@@ -2565,19 +2593,35 @@ static int __do_fault(struct mm_struct *
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(vmf.page);
-out_unlocked:
-	if (anon)
-		page_cache_release(vmf.page);
-	else if (dirty_page) {
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+	if (dirty_page) {
+		struct address_space *mapping = page->mapping;
 
-		set_page_dirty_balance(dirty_page, page_mkwrite);
+		if (set_page_dirty(dirty_page))
+			page_mkwrite = 1;
+		unlock_page(dirty_page);
 		put_page(dirty_page);
+		if (page_mkwrite && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping but still
+			 * dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+	} else {
+		unlock_page(vmf.page);
+		if (anon)
+			page_cache_release(vmf.page);
 	}
 
 	return ret;
+
+unwritable_page:
+	page_cache_release(page);
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [patch 12/28] GFS2: Fix page_mkwrite() return code
       [not found] ` <20090514225413.GA705@kroah.com>
                     ` (2 preceding siblings ...)
  2009-05-14 22:51   ` [patch 11/28] mm: close page_mkwrite races Greg KH
@ 2009-05-14 22:51   ` Greg KH
  2009-05-14 22:51   ` [patch 13/28] NFS: Fix the return value in nfs_page_mkwrite() Greg KH
  2009-05-14 22:51   ` [patch 14/28] NFS: Close page_mkwrite() races Greg KH
  5 siblings, 0 replies; 6+ messages in thread
From: Greg KH @ 2009-05-14 22:51 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Justin Forbes, Zwane Mwaikambo, Theodore Ts'o, Randy Dunlap,
	Dave Jones, Chuck Wolber, Chris Wedgwood, Michael Krufky,
	Chuck Ebbert, Domenico Andreoli, Willy Tarreau,
	Rodrigo Rubira Branco, Jake Edge, Eugene Teo, torvalds, akpm,
	alan, linux-fsdevel, linux-mm, Steven Whitehouse, Nick Piggin

[-- Attachment #1: gfs2-fix-page_mkwrite-return-code.patch --]
[-- Type: text/plain, Size: 821 bytes --]


2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Steven Whitehouse <swhiteho@redhat.com>

commit e56985da455b9dc0591b8cb2006cc94b6f4fb0f4 upstream

This allows for the possibility of returning VM_FAULT_OOM as
well as VM_FAULT_SIGBUS. This ensures that the correct action
is taken.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>


---
 fs/gfs2/ops_file.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -412,7 +412,9 @@ out_unlock:
 	gfs2_glock_dq(&gh);
 out:
 	gfs2_holder_uninit(&gh);
-	if (ret)
+	if (ret == -ENOMEM)
+		ret = VM_FAULT_OOM;
+	else if (ret)
 		ret = VM_FAULT_SIGBUS;
 	return ret;
 }



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [patch 13/28] NFS: Fix the return value in nfs_page_mkwrite()
       [not found] ` <20090514225413.GA705@kroah.com>
                     ` (3 preceding siblings ...)
  2009-05-14 22:51   ` [patch 12/28] GFS2: Fix page_mkwrite() return code Greg KH
@ 2009-05-14 22:51   ` Greg KH
  2009-05-14 22:51   ` [patch 14/28] NFS: Close page_mkwrite() races Greg KH
  5 siblings, 0 replies; 6+ messages in thread
From: Greg KH @ 2009-05-14 22:51 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Justin Forbes, Zwane Mwaikambo, Theodore Ts'o, Randy Dunlap,
	Dave Jones, Chuck Wolber, Chris Wedgwood, Michael Krufky,
	Chuck Ebbert, Domenico Andreoli, Willy Tarreau,
	Rodrigo Rubira Branco, Jake Edge, Eugene Teo, torvalds, akpm,
	alan, linux-fsdevel, linux-mm, Trond Myklebust, Nick Piggin

[-- Attachment #1: nfs-fix-the-return-value-in-nfs_page_mkwrite.patch --]
[-- Type: text/plain, Size: 931 bytes --]

2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 2b2ec7554cf7ec5e4412f89a5af6abe8ce950700 upstream

Commit c2ec175c39f62949438354f603f4aa170846aabb ("mm: page_mkwrite
change prototype to match fault") exposed a bug in the NFS
implementation of page_mkwrite.  We should be returning 0 on success...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 fs/nfs/file.c |    2 --
 1 file changed, 2 deletions(-)

--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -477,8 +477,6 @@ static int nfs_vm_page_mkwrite(struct vm
 		goto out_unlock;
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
-	if (ret == 0)
-		ret = pagelen;
 out_unlock:
 	unlock_page(page);
 	if (ret)



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [patch 14/28] NFS: Close page_mkwrite() races
       [not found] ` <20090514225413.GA705@kroah.com>
                     ` (4 preceding siblings ...)
  2009-05-14 22:51   ` [patch 13/28] NFS: Fix the return value in nfs_page_mkwrite() Greg KH
@ 2009-05-14 22:51   ` Greg KH
  5 siblings, 0 replies; 6+ messages in thread
From: Greg KH @ 2009-05-14 22:51 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Justin Forbes, Zwane Mwaikambo, Theodore Ts'o, Randy Dunlap,
	Dave Jones, Chuck Wolber, Chris Wedgwood, Michael Krufky,
	Chuck Ebbert, Domenico Andreoli, Willy Tarreau,
	Rodrigo Rubira Branco, Jake Edge, Eugene Teo, torvalds, akpm,
	alan, linux-fsdevel, linux-mm, Trond Myklebust, Nick Piggin

[-- Attachment #1: nfs-close-page_mkwrite-races.patch --]
[-- Type: text/plain, Size: 1062 bytes --]


2.6.27-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Trond Myklebust <Trond.Myklebust@netapp.com>

commit 7fdf523067666b0eaff330f362401ee50ce187c4 upstream

Follow up to Nick Piggin's patches to ensure that nfs_vm_page_mkwrite
returns with the page lock held, and sets the VM_FAULT_LOCKED flag.

See http://bugzilla.kernel.org/show_bug.cgi?id=12913

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>


---
 fs/nfs/file.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -478,10 +478,10 @@ static int nfs_vm_page_mkwrite(struct vm
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
 out_unlock:
+	if (!ret)
+		return VM_FAULT_LOCKED;
 	unlock_page(page);
-	if (ret)
-		ret = VM_FAULT_SIGBUS;
-	return ret;
+	return VM_FAULT_SIGBUS;
 }
 
 static struct vm_operations_struct nfs_file_vm_ops = {



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2009-05-14 22:58 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20090514225126.907908936@mini.kroah.org>
     [not found] ` <20090514225413.GA705@kroah.com>
2009-05-14 22:51   ` [patch 09/28] mm: page_mkwrite change prototype to match fault Greg KH
2009-05-14 22:51   ` [patch 10/28] fs: fix page_mkwrite error cases in core code and btrfs Greg KH
2009-05-14 22:51   ` [patch 11/28] mm: close page_mkwrite races Greg KH
2009-05-14 22:51   ` [patch 12/28] GFS2: Fix page_mkwrite() return code Greg KH
2009-05-14 22:51   ` [patch 13/28] NFS: Fix the return value in nfs_page_mkwrite() Greg KH
2009-05-14 22:51   ` [patch 14/28] NFS: Close page_mkwrite() races Greg KH

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).