[PATCH mmotm] nilfs2: insert checks and hole block allocation in page

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite
@ 2009-01-07 13:06 Ryusuke Konishi
  2009-01-07 14:43 ` Chris Mason
  0 siblings, 1 reply; 5+ messages in thread
From: Ryusuke Konishi @ 2009-01-07 13:06 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel, linux-kernel, Ryusuke Konishi, Chris Mason

Chris Mason told me that page_mkwrite() has some works to do.

On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> nilfs_page_mkwrite doesn't seem to dirty the page?  block_page_mkwrite
> does more, including checks against i_size and others.

This will insert i_size check, and hole block allocation code in the
nilfs_page_mkwrite.

Previously, the hole block allocation was delayed until just before
writing for mmapped pages.  This accompanies removal of the delayed
allocation code.

Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/file.c    |   59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nilfs2/segment.c |   42 ++++--------------------------------
 2 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7ddd42e..146dc23 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -75,11 +75,66 @@ nilfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
-	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
-		return -EPERM;
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct nilfs_transaction_info ti;
+	struct buffer_head *bh, *head;
+	int fully_mapped = 1;
+	int ret = -EINVAL;
+
+	/*
+	 * use i_alloc_sem to stop truncate operations on the inode
+	 */
+	down_read(&inode->i_alloc_sem);
+
+	if (page->mapping != mapping ||
+	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page))
+		goto failed;
+
+	if (PageMappedToDisk(page))
+		goto mapped;
+
+	spin_lock(&mapping->private_lock); /* exclude try_to_free_buffers() */
+	if (page_has_buffers(page)) {
+		bh = head = page_buffers(page);
+		do {
+			if (!buffer_mapped(bh)) {
+				fully_mapped = 0;
+				break;
+			}
+		} while (bh = bh->b_this_page, bh != head);
+
+		if (fully_mapped) {
+			spin_unlock(&mapping->private_lock);
+			SetPageMappedToDisk(page);
+			goto mapped;
+		}
+	}
+	spin_unlock(&mapping->private_lock);
+
+	/*
+	 * Fill hole blocks. This accompanies a disk capacity check.
+	 */
+	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+	if (unlikely(ret))
+		goto failed;
+
+	ret = block_page_mkwrite(vma, page, nilfs_get_block);
+	if (unlikely(ret)) {
+		nilfs_transaction_abort(inode->i_sb);
+		goto failed;
+	}
+	nilfs_transaction_commit(inode->i_sb);
+
+ mapped:
+	up_read(&inode->i_alloc_sem);
 	SetPageChecked(page);
 	wait_on_page_writeback(page);
 	return 0;
+
+ failed:
+	up_read(&inode->i_alloc_sem);
+	return ret;
 }
 
 struct vm_operations_struct nilfs_file_vm_ops = {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 182fb26..b5d28d2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -667,42 +667,6 @@ struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.write_node_binfo = NULL,
 };
 
-static int nilfs_prepare_data_page(struct inode *inode, struct page *page)
-{
-	int err = 0;
-
-	lock_page(page);
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
-	if (!PageMappedToDisk(page)) {
-		struct buffer_head *bh, *head;
-		sector_t blkoff
-			= page->index << (PAGE_SHIFT - inode->i_blkbits);
-
-		int non_mapped = 0;
-
-		bh = head = page_buffers(page);
-		do {
-			if (!buffer_mapped(bh)) {
-				if (!buffer_dirty(bh)) {
-					non_mapped++;
-					continue;
-				}
-				err = nilfs_get_block(inode, blkoff, bh, 1);
-				if (unlikely(err))
-					goto out_unlock;
-			}
-		} while (blkoff++, (bh = bh->b_this_page) != head);
-		if (!non_mapped)
-			SetPageMappedToDisk(page);
-	}
-
- out_unlock:
-	unlock_page(page);
-	return err;
-}
-
 static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
 					   struct list_head *listp,
 					   struct nilfs_sc_info *sci)
@@ -727,7 +691,11 @@ static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
 		struct page *page = pvec.pages[i];
 
 		if (mapping->host) {
-			err = nilfs_prepare_data_page(inode, page);
+			lock_page(page);
+			if (!page_has_buffers(page))
+				create_empty_buffers(page,
+						     1 << inode->i_blkbits, 0);
+			unlock_page(page);
 			if (unlikely(err))
 				break;
 		}
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite
  2009-01-07 13:06 [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite Ryusuke Konishi
@ 2009-01-07 14:43 ` Chris Mason
  2009-01-07 17:33   ` Ryusuke Konishi
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Mason @ 2009-01-07 14:43 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: Andrew Morton, linux-fsdevel, linux-kernel

On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> Chris Mason told me that page_mkwrite() has some works to do.
> 
> On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > nilfs_page_mkwrite doesn't seem to dirty the page?  block_page_mkwrite
> > does more, including checks against i_size and others.
> 
> This will insert i_size check, and hole block allocation code in the
> nilfs_page_mkwrite.
> 
> Previously, the hole block allocation was delayed until just before
> writing for mmapped pages.  This accompanies removal of the delayed
> allocation code.
> 
> Cc: Chris Mason <chris.mason@oracle.com>
> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
> ---
>  fs/nilfs2/file.c    |   59 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  fs/nilfs2/segment.c |   42 ++++--------------------------------
>  2 files changed, 62 insertions(+), 39 deletions(-)
> 
> diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
> index 7ddd42e..146dc23 100644
> --- a/fs/nilfs2/file.c
> +++ b/fs/nilfs2/file.c
> @@ -75,11 +75,66 @@ nilfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  
>  static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
>  {
> -	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
> -		return -EPERM;
> +	struct inode *inode = vma->vm_file->f_dentry->d_inode;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct nilfs_transaction_info ti;
> +	struct buffer_head *bh, *head;
> +	int fully_mapped = 1;
> +	int ret = -EINVAL;
> +
> +	/*
> +	 * use i_alloc_sem to stop truncate operations on the inode
> +	 */
> +	down_read(&inode->i_alloc_sem);

I'm not 100% sure this is safe.  It seems likely the direct io paths
could trigger a mkwrite with the i_alloc_sem already held?

If I'm reading the code correctly you're:

* grab i_alloc sem to protect against truncate
* check to see if the page is mapped already (no holes)

* if there are holes, lock the page, start transaction and do the full
block_page_mkwrite

* drop i_alloc_sem

Would this work instead?:

* lock the page, check to see if it is mapped

* if there are holes, drop the lock, start transaction and do the full
block_page_mkwrite

-chris



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite
  2009-01-07 14:43 ` Chris Mason
@ 2009-01-07 17:33   ` Ryusuke Konishi
  2009-01-07 17:47     ` Chris Mason
  0 siblings, 1 reply; 5+ messages in thread
From: Ryusuke Konishi @ 2009-01-07 17:33 UTC (permalink / raw)
  To: chris.mason; +Cc: konishi.ryusuke, akpm, linux-fsdevel, linux-kernel

On Wed, 07 Jan 2009 09:43:02 -0500, Chris Mason wrote:
> On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> > Chris Mason told me that page_mkwrite() has some works to do.
> > 
> > On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > > nilfs_page_mkwrite doesn't seem to dirty the page?  block_page_mkwrite
> > > does more, including checks against i_size and others.
> > 
> > This will insert i_size check, and hole block allocation code in the
> > nilfs_page_mkwrite.
> > 
> > Previously, the hole block allocation was delayed until just before
> > writing for mmapped pages.  This accompanies removal of the delayed
> > allocation code.
<snip>
> >  static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> >  {
> > -	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
> > -		return -EPERM;
> > +	struct inode *inode = vma->vm_file->f_dentry->d_inode;
> > +	struct address_space *mapping = inode->i_mapping;
> > +	struct nilfs_transaction_info ti;
> > +	struct buffer_head *bh, *head;
> > +	int fully_mapped = 1;
> > +	int ret = -EINVAL;
> > +
> > +	/*
> > +	 * use i_alloc_sem to stop truncate operations on the inode
> > +	 */
> > +	down_read(&inode->i_alloc_sem);
> 
> I'm not 100% sure this is safe.  It seems likely the direct io paths
> could trigger a mkwrite with the i_alloc_sem already held?

vmtruncate() can change i_size outside the page lock, and it even
calls unmap_mapping_range(). Does it have any problems?

> If I'm reading the code correctly you're:
> 
> * grab i_alloc sem to protect against truncate
> * check to see if the page is mapped already (no holes)
> 
> * if there are holes, lock the page, start transaction and do the full
> block_page_mkwrite
> 
> * drop i_alloc_sem

Yes, it is.
 
> Would this work instead?:
> 
> * lock the page, check to see if it is mapped
> 
> * if there are holes, drop the lock, start transaction and do the full
> block_page_mkwrite
> 
> -chris

If we don't need care for vmtruncate(), I think it's better, too.

Regards,
Ryusuke

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite
  2009-01-07 17:33   ` Ryusuke Konishi
@ 2009-01-07 17:47     ` Chris Mason
  2009-01-08  7:59       ` Ryusuke Konishi
  0 siblings, 1 reply; 5+ messages in thread
From: Chris Mason @ 2009-01-07 17:47 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: akpm, linux-fsdevel, linux-kernel

On Thu, 2009-01-08 at 02:33 +0900, Ryusuke Konishi wrote:
> On Wed, 07 Jan 2009 09:43:02 -0500, Chris Mason wrote:
> > On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> > > Chris Mason told me that page_mkwrite() has some works to do.
> > > 
> > > On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > > > nilfs_page_mkwrite doesn't seem to dirty the page?  block_page_mkwrite
> > > > does more, including checks against i_size and others.
> > > 
> > > This will insert i_size check, and hole block allocation code in the
> > > nilfs_page_mkwrite.
> > > 
> > > Previously, the hole block allocation was delayed until just before
> > > writing for mmapped pages.  This accompanies removal of the delayed
> > > allocation code.
> <snip>
> > >  static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
> > >  {
> > > -	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
> > > -		return -EPERM;
> > > +	struct inode *inode = vma->vm_file->f_dentry->d_inode;
> > > +	struct address_space *mapping = inode->i_mapping;
> > > +	struct nilfs_transaction_info ti;
> > > +	struct buffer_head *bh, *head;
> > > +	int fully_mapped = 1;
> > > +	int ret = -EINVAL;
> > > +
> > > +	/*
> > > +	 * use i_alloc_sem to stop truncate operations on the inode
> > > +	 */
> > > +	down_read(&inode->i_alloc_sem);
> > 
> > I'm not 100% sure this is safe.  It seems likely the direct io paths
> > could trigger a mkwrite with the i_alloc_sem already held?
> 
> vmtruncate() can change i_size outside the page lock, and it even
> calls unmap_mapping_range(). Does it have any problems?
> 

You're right, i_size can change without you knowing about it, but before
vmtruncate does anything to the page itself, it locks the page.

So, we tend to do a somewhat racey check against i_size.  If the page is
outside i_size, we happily ignore it.  If it is inside i_size, we assume
it is part of the file.  Most of the code that does this checks i_size
once after locking the page and saves that value, so if i_size changes
later on the code at least does consistent operations based on a single
value of i_size.

If the page straddles i_size and someone extends the file, they are
expected to lock the page as well (see block_truncate_page and the
places it gets called).

-chris



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite
  2009-01-07 17:47     ` Chris Mason
@ 2009-01-08  7:59       ` Ryusuke Konishi
  0 siblings, 0 replies; 5+ messages in thread
From: Ryusuke Konishi @ 2009-01-08  7:59 UTC (permalink / raw)
  To: chris.mason, akpm; +Cc: konishi.ryusuke, linux-fsdevel, linux-kernel

On Wed, 07 Jan 2009 12:47:22 -0500, Chris Mason wrote:
> On Thu, 2009-01-08 at 02:33 +0900, Ryusuke Konishi wrote:
> > On Wed, 07 Jan 2009 09:43:02 -0500, Chris Mason wrote:
> > > On Wed, 2009-01-07 at 22:06 +0900, Ryusuke Konishi wrote:
> > > > Chris Mason told me that page_mkwrite() has some works to do.
> > > > 
> > > > On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> > > > > nilfs_page_mkwrite doesn't seem to dirty the page?  block_page_mkwrite
> > > > > does more, including checks against i_size and others.
> > > > 
> > > > This will insert i_size check, and hole block allocation code in the
> > > > nilfs_page_mkwrite.
> > > > 
> > > > Previously, the hole block allocation was delayed until just before
> > > > writing for mmapped pages.  This accompanies removal of the delayed
> > > > allocation code.
> > <snip>
> > > > +	/*
> > > > +	 * use i_alloc_sem to stop truncate operations on the inode
> > > > +	 */
> > > > +	down_read(&inode->i_alloc_sem);
> > > 
> > > I'm not 100% sure this is safe.  It seems likely the direct io paths
> > > could trigger a mkwrite with the i_alloc_sem already held?
> > 
> > vmtruncate() can change i_size outside the page lock, and it even
> > calls unmap_mapping_range(). Does it have any problems?
> 
> You're right, i_size can change without you knowing about it, but before
> vmtruncate does anything to the page itself, it locks the page.
> 
> So, we tend to do a somewhat racey check against i_size.  If the page is
> outside i_size, we happily ignore it.  If it is inside i_size, we assume
> it is part of the file.  Most of the code that does this checks i_size
> once after locking the page and saves that value, so if i_size changes
> later on the code at least does consistent operations based on a single
> value of i_size.
> 
> If the page straddles i_size and someone extends the file, they are
> expected to lock the page as well (see block_truncate_page and the
> places it gets called).
> 
> -chris

I got it, using page locking seems OK even for vmtruncate().

I don't know if the mkwrite is actually called from the direct io
path, but no doubt about i_alloc_sem is used in it.  To make this
safer and avoid confusion, I took your advice.

The revised patch is attached below.

Ryusuke
--
nilfs2: insert checks and hole block allocation in page_mkwrite (rev 2)

Chris Mason told me that page_mkwrite() has some works to do.

On Wed, 17 Dec 2008 21:52:55 -0500, Chris Mason wrote:
> nilfs_page_mkwrite doesn't seem to dirty the page?  block_page_mkwrite
> does more, including checks against i_size and others.

This will insert i_size check, a disk capacity check, and hole block
allocation code in the nilfs_page_mkwrite.

Previously, the hole block allocation was delayed until just before
writing for mmapped pages.  This accompanies removal of the delayed
allocation code.

This revision uses a page lock instead of i_alloc_sem for safety.

Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/file.c    |   58 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nilfs2/segment.c |   44 ++++----------------------------------
 2 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7ddd42e..c22f97c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -75,8 +75,62 @@ nilfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
-	if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
-		return -EPERM;
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct nilfs_transaction_info ti;
+	int ret;
+
+	if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
+		return -ENOSPC;
+
+	lock_page(page);
+	if (page->mapping != inode->i_mapping ||
+	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
+		unlock_page(page);
+		return -EINVAL;
+	}
+
+	/*
+	 * check to see if the page is mapped already (no holes)
+	 */
+	if (PageMappedToDisk(page)) {
+		unlock_page(page);
+		goto mapped;
+	}
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+		int fully_mapped = 1;
+
+		bh = head = page_buffers(page);
+		do {
+			if (!buffer_mapped(bh)) {
+				fully_mapped = 0;
+				break;
+			}
+		} while (bh = bh->b_this_page, bh != head);
+
+		if (fully_mapped) {
+			SetPageMappedToDisk(page);
+			unlock_page(page);
+			goto mapped;
+		}
+	}
+	unlock_page(page);
+
+	/*
+	 * fill hole blocks
+	 */
+	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+	if (unlikely(ret))
+		return ret;
+
+	ret = block_page_mkwrite(vma, page, nilfs_get_block);
+	if (unlikely(ret)) {
+		nilfs_transaction_abort(inode->i_sb);
+		return ret;
+	}
+	nilfs_transaction_commit(inode->i_sb);
+
+ mapped:
 	SetPageChecked(page);
 	wait_on_page_writeback(page);
 	return 0;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 182fb26..b4b96ac 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -667,42 +667,6 @@ struct nilfs_sc_operations nilfs_sc_dsync_ops = {
 	.write_node_binfo = NULL,
 };
 
-static int nilfs_prepare_data_page(struct inode *inode, struct page *page)
-{
-	int err = 0;
-
-	lock_page(page);
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
-	if (!PageMappedToDisk(page)) {
-		struct buffer_head *bh, *head;
-		sector_t blkoff
-			= page->index << (PAGE_SHIFT - inode->i_blkbits);
-
-		int non_mapped = 0;
-
-		bh = head = page_buffers(page);
-		do {
-			if (!buffer_mapped(bh)) {
-				if (!buffer_dirty(bh)) {
-					non_mapped++;
-					continue;
-				}
-				err = nilfs_get_block(inode, blkoff, bh, 1);
-				if (unlikely(err))
-					goto out_unlock;
-			}
-		} while (blkoff++, (bh = bh->b_this_page) != head);
-		if (!non_mapped)
-			SetPageMappedToDisk(page);
-	}
-
- out_unlock:
-	unlock_page(page);
-	return err;
-}
-
 static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
 					   struct list_head *listp,
 					   struct nilfs_sc_info *sci)
@@ -727,9 +691,11 @@ static int nilfs_lookup_dirty_data_buffers(struct inode *inode,
 		struct page *page = pvec.pages[i];
 
 		if (mapping->host) {
-			err = nilfs_prepare_data_page(inode, page);
-			if (unlikely(err))
-				break;
+			lock_page(page);
+			if (!page_has_buffers(page))
+				create_empty_buffers(page,
+						     1 << inode->i_blkbits, 0);
+			unlock_page(page);
 		}
 
 		bh = head = page_buffers(page);
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2009-01-08  8:04 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-01-07 13:06 [PATCH mmotm] nilfs2: insert checks and hole block allocation in page_mkwrite Ryusuke Konishi
2009-01-07 14:43 ` Chris Mason
2009-01-07 17:33   ` Ryusuke Konishi
2009-01-07 17:47     ` Chris Mason
2009-01-08  7:59       ` Ryusuke Konishi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).