linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Nick Piggin <npiggin@suse.de>
To: linux-fsdevel@vger.kernel.org
Cc: hch@infradead.org, viro@zeniv.linux.org.uk, jack@suse.cz,
	linux-mm@kvack.org
Subject: [patch 2/3] fs: buffer_head writepage no zero
Date: Fri, 10 Jul 2009 11:34:03 +0200	[thread overview]
Message-ID: <20090710093403.GH14666@wotan.suse.de> (raw)
In-Reply-To: <20090710093325.GG14666@wotan.suse.de>


When writing a page to filesystem, buffer.c zeroes out parts of the page past
i_size in an attempt to get zeroes into those blocks on disk, so as to honour
the requirement that an expanding truncate should zero-fill the file.

Unfortunately, this is racy. The reason we can get something other than
zeroes here is via an mmaped write to the block beyond i_size. Zeroing it
out before writepage narrows the window, but it is still possible to store
junk beyond i_size on disk, by storing into the page after writepage zeroes,
but before DMA (or copy) completes. This allows process A to break posix
semantics for process B (or even inadvertently for itsef).

It could also be possible that the filesystem has written data into the
block but not yet expanded the inode size when the system crashes for
some reason. Unless its journal reply / fsck process etc checks for this
condition, it could also cause subsequent breakage in semantics.

---
 fs/buffer.c      |   94 +++++++++++++++++++++++++------------------------------
 fs/ext2/inode.c  |   30 ++++++++++++++++-
 fs/mpage.c       |   13 +------
 mm/filemap_xip.c |   15 ++++----
 mm/truncate.c    |    1 
 5 files changed, 82 insertions(+), 71 deletions(-)

Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -2656,26 +2656,14 @@ int nobh_writepage(struct page *page, ge
 	unsigned offset;
 	int ret;
 
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		goto out;
-
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
+	if (page->index >= end_index &&
+			(page->index >= end_index+1 || !offset)) {
 		unlock_page(page);
 		return 0;
 	}
 
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invocation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
 		ret = __block_write_full_page(inode, page, get_block, wbc,
@@ -2695,22 +2683,23 @@ int nobh_truncate_page(struct address_sp
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head map_bh;
-	int err;
+	int err = 0;
 
 	blocksize = 1 << inode->i_blkbits;
 	length = offset & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
 	if (!length)
-		return 0;
+		goto out;
 
 	length = blocksize - length;
 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	page = grab_cache_page(mapping, index);
-	err = -ENOMEM;
-	if (!page)
+	if (!page) {
+		err = -ENOMEM;
 		goto out;
+	}
 
 	if (page_has_buffers(page)) {
 has_buffers:
@@ -2752,7 +2741,6 @@ has_buffers:
 	}
 	zero_user(page, offset, length);
 	set_page_dirty(page);
-	err = 0;
 
 unlock:
 	unlock_page(page);
@@ -2762,8 +2750,8 @@ out:
 }
 EXPORT_SYMBOL(nobh_truncate_page);
 
-int block_truncate_page(struct address_space *mapping,
-			loff_t from, get_block_t *get_block)
+int __block_truncate_page(struct address_space *mapping,
+			loff_t from, loff_t to, get_block_t *get_block)
 {
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -2773,22 +2761,22 @@ int block_truncate_page(struct address_s
 	struct inode *inode = mapping->host;
 	struct page *page;
 	struct buffer_head *bh;
-	int err;
+	int err = 0;
 
-	blocksize = 1 << inode->i_blkbits;
-	length = offset & (blocksize - 1);
+	/* Page boundary? Nothing to do */
+	if (!offset)
+		goto out;
 
-	/* Block boundary? Nothing to do */
-	if (!length)
-		return 0;
+	blocksize = 1 << inode->i_blkbits;
 
-	length = blocksize - length;
+	length = (unsigned)min_t(loff_t, to - from, PAGE_CACHE_SIZE - offset);
 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 	
 	page = grab_cache_page(mapping, index);
-	err = -ENOMEM;
-	if (!page)
+	if (!page) {
+		err = -ENOMEM;
 		goto out;
+	}
 
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
@@ -2802,15 +2790,20 @@ int block_truncate_page(struct address_s
 		pos += blocksize;
 	}
 
-	err = 0;
 	if (!buffer_mapped(bh)) {
 		WARN_ON(bh->b_size != blocksize);
 		err = get_block(inode, iblock, bh, 0);
 		if (err)
 			goto unlock;
-		/* unmapped? It's a hole - nothing to do */
-		if (!buffer_mapped(bh))
+		/*
+		 * unmapped? It's a hole - must zero out partial
+		 * in the case of an extending truncate where mmap has
+		 * previously written past i_size of the page
+		 */
+		if (!buffer_mapped(bh)) {
+			zero_user(page, offset, length);
 			goto unlock;
+		}
 	}
 
 	/* Ok, it's mapped. Make sure it's up-to-date */
@@ -2818,17 +2811,17 @@ int block_truncate_page(struct address_s
 		set_buffer_uptodate(bh);
 
 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
-		err = -EIO;
 		ll_rw_block(READ, 1, &bh);
 		wait_on_buffer(bh);
 		/* Uhhuh. Read error. Complain and punt. */
-		if (!buffer_uptodate(bh))
+		if (!buffer_uptodate(bh)) {
+			err = -EIO;
 			goto unlock;
+		}
 	}
 
 	zero_user(page, offset, length);
 	mark_buffer_dirty(bh);
-	err = 0;
 
 unlock:
 	unlock_page(page);
@@ -2836,6 +2829,19 @@ unlock:
 out:
 	return err;
 }
+EXPORT_SYMBOL(__block_truncate_page);
+
+int block_truncate_page(struct address_space *mapping,
+			loff_t from, get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	int err = 0;
+
+	if (from > inode->i_size)
+		err = __block_truncate_page(mapping, inode->i_size, from, get_block);
+
+	return err;
+}
 
 /*
  * The generic ->writepage function for buffer-backed address_spaces
@@ -2849,26 +2855,14 @@ int block_write_full_page_endio(struct p
 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 	unsigned offset;
 
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		return __block_write_full_page(inode, page, get_block, wbc,
-					       handler);
-
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
+	if (page->index >= end_index &&
+			(page->index >= end_index+1 || !offset)) {
 		unlock_page(page);
 		return 0;
 	}
 
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invokation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
 	return __block_write_full_page(inode, page, get_block, wbc, handler);
 }
 
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c
+++ linux-2.6/mm/filemap_xip.c
@@ -440,22 +440,23 @@ EXPORT_SYMBOL_GPL(xip_file_write);
 int
 xip_truncate_page(struct address_space *mapping, loff_t from)
 {
+	struct inode *inode = mapping->host;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize;
 	unsigned length;
 	void *xip_mem;
 	unsigned long xip_pfn;
-	int err;
+	int err = 0;
 
 	BUG_ON(!mapping->a_ops->get_xip_mem);
 
-	blocksize = 1 << mapping->host->i_blkbits;
+	blocksize = 1 << inode->i_blkbits;
 	length = offset & (blocksize - 1);
 
 	/* Block boundary? Nothing to do */
 	if (!length)
-		return 0;
+		goto out;
 
 	length = blocksize - length;
 
@@ -464,11 +465,11 @@ xip_truncate_page(struct address_space *
 	if (unlikely(err)) {
 		if (err == -ENODATA)
 			/* Hole? No need to truncate */
-			return 0;
-		else
-			return err;
+			err = 0;
+		goto out;
 	}
 	memset(xip_mem + offset, 0, length);
-	return 0;
+out:
+	return err;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
Index: linux-2.6/fs/mpage.c
===================================================================
--- linux-2.6.orig/fs/mpage.c
+++ linux-2.6/fs/mpage.c
@@ -559,19 +559,10 @@ static int __mpage_writepage(struct page
 page_is_mapped:
 	end_index = i_size >> PAGE_CACHE_SHIFT;
 	if (page->index >= end_index) {
-		/*
-		 * The page straddles i_size.  It must be zeroed out on each
-		 * and every writepage invokation because it may be mmapped.
-		 * "A file is mapped in multiples of the page size.  For a file
-		 * that is not a multiple of the page size, the remaining memory
-		 * is zeroed when mapped, and writes to that region are not
-		 * written out to the file."
-		 */
 		unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
 
-		if (page->index > end_index || !offset)
-			goto confused;
-		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+		if (page->index >= end_index+1 || !offset)
+			goto confused; /* page fully outside i_size */
 	}
 
 	/*
Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -777,14 +777,40 @@ ext2_write_begin(struct file *file, stru
 	return ret;
 }
 
+int __block_truncate_page(struct address_space *mapping,
+			loff_t from, loff_t to, get_block_t *get_block);
 static int ext2_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
+	struct inode *inode = mapping->host;
 	int ret;
 
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	if (ret < len) {
+	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	unlock_page(page);
+	page_cache_release(page);
+        if (pos+copied > inode->i_size) {
+		int err;
+                if (pos > inode->i_size) {
+                        /* expanding a hole */
+			err = __block_truncate_page(mapping, inode->i_size,
+						pos, ext2_get_block);
+			if (err) {
+				ret = err;
+				goto out;
+			}
+			err = __block_truncate_page(mapping, pos+copied,
+						LLONG_MAX, ext2_get_block);
+			if (err) {
+				ret = err;
+				goto out;
+			}
+                }
+                i_size_write(inode, pos+copied);
+                mark_inode_dirty(inode);
+        }
+out:
+	if (ret < 0 || ret < len) {
 		struct inode *inode = mapping->host;
 		loff_t isize = inode->i_size;
 		if (pos + len > isize)
Index: linux-2.6/mm/truncate.c
===================================================================
--- linux-2.6.orig/mm/truncate.c
+++ linux-2.6/mm/truncate.c
@@ -49,7 +49,6 @@ void do_invalidatepage(struct page *page
 
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
-	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
 	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }

  reply	other threads:[~2009-07-10  9:34 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-07-10  7:30 [patch 0/5] new truncate sequence patchset npiggin
2009-07-10  7:30 ` [patch 1/5] fs: new truncate helpers npiggin
2009-07-10  7:30 ` [patch 2/5] fs: use " npiggin-l3A5Bk7waGM
2009-07-10  7:30 ` [patch 3/5] fs: introduce new truncate sequence npiggin
2009-07-10  7:30 ` [patch 4/5] tmpfs: convert to use the new truncate convention npiggin
2009-07-10  7:30 ` [patch 5/5] ext2: " npiggin
2009-09-01 18:29   ` Jan Kara
2009-09-02  9:14     ` Nick Piggin
2009-09-02 11:14       ` Jan Kara
2009-07-10  9:33 ` [patch 1/3] fs: buffer_head writepage no invalidate Nick Piggin
2009-07-10  9:34   ` Nick Piggin [this message]
2009-07-10 11:46     ` [patch 2/3] fs: buffer_head writepage no zero Jan Kara
2009-07-13  6:54       ` Nick Piggin
2009-07-10  9:35   ` [patch 3/3] fs: buffer_head page_lock i_size relax Nick Piggin
2009-07-10 11:08   ` [patch 1/3] fs: buffer_head writepage no invalidate Jan Kara
2009-07-10 14:31 ` [patch 0/5] new truncate sequence patchset Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090710093403.GH14666@wotan.suse.de \
    --to=npiggin@suse.de \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).