[patch 1/5] fs: add an iovec iterator

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [patch 1/5] fs: add an iovec iterator
@ 2007-03-14 13:38 Nick Piggin
  2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-14 13:38 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Christoph Hellwig, Nick Piggin, Andrew Morton,
	Mark Fasheh

Add an iterator data structure to operate over an iovec. Add usercopy
operators needed by generic_file_buffered_write, and convert that function
over.

 include/linux/fs.h |   33 ++++++++++++
 mm/filemap.c       |  144 +++++++++++++++++++++++++++++++++++++++++++----------
 mm/filemap.h       |  103 -------------------------------------
 3 files changed, 150 insertions(+), 130 deletions(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -396,6 +396,39 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+struct iov_iter {
+	const struct iovec *iov;
+	unsigned long nr_segs;
+	size_t iov_offset;
+	size_t count;
+};
+
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes);
+size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes);
+void iov_iter_advance(struct iov_iter *i, size_t bytes);
+int iov_iter_fault_in_readable(struct iov_iter *i);
+size_t iov_iter_single_seg_count(struct iov_iter *i);
+
+static inline void iov_iter_init(struct iov_iter *i,
+			const struct iovec *iov, unsigned long nr_segs,
+			size_t count, size_t written)
+{
+	i->iov = iov;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+
+static inline size_t iov_iter_count(struct iov_iter *i)
+{
+	return i->count;
+}
+
+
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
 	int (*readpage)(struct file *, struct page *);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -30,7 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
-#include "filemap.h"
+#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include "internal.h"
 
 /*
@@ -1839,8 +1839,7 @@ int remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(remove_suid);
 
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 			const struct iovec *iov, size_t base, size_t bytes)
 {
 	size_t copied = 0, left = 0;
@@ -1863,6 +1862,110 @@ __filemap_copy_from_user_iovec_inatomic(
 }
 
 /*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were sucessfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic_nocache(kaddr + offset,
+							buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return copied;
+}
+
+/*
+ * This has the same sideeffects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = i->iov->iov_base + i->iov_offset;
+		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						i->iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
+{
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+	} else {
+		const struct iovec *iov = i->iov;
+		size_t base = i->iov_offset;
+
+		while (bytes) {
+			int copy = min(bytes, iov->iov_len - base);
+
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				base = 0;
+			}
+		}
+		i->iov = iov;
+		i->iov_offset = base;
+	}
+}
+
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	__iov_iter_advance_iov(i, bytes);
+	i->count -= bytes;
+}
+
+int iov_iter_fault_in_readable(struct iov_iter *i)
+{
+	size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
+	char __user *buf = i->iov->iov_base + i->iov_offset;
+	return fault_in_pages_readable(buf, seglen);
+}
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(struct iov_iter *i)
+{
+	const struct iovec *iov = i->iov;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+/*
  * Performs necessary checks before doing a write
  *
  * Can adjust writing position or amount of bytes to write.
@@ -2022,30 +2125,22 @@ generic_file_buffered_write(struct kiocb
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
-	const struct iovec *cur_iov = iov; /* current iovec */
-	size_t		iov_offset = 0;	   /* offset in the current iovec */
-	char __user	*buf;
+	struct iov_iter i;
 
-	/*
-	 * handle partial DIO write.  Adjust cur_iov if needed.
-	 */
-	filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, written);
+	iov_iter_init(&i, iov, nr_segs, count, written);
 
 	do {
 		struct page *src_page;
 		struct page *page;
 		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long seglen;	/* Bytes remaining in current iovec */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
 
-		buf = cur_iov->iov_base + iov_offset;
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = PAGE_CACHE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_count(&i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -2053,10 +2148,6 @@ generic_file_buffered_write(struct kiocb
 		 */
 		src_page = NULL;
 
-		seglen = cur_iov->iov_len - iov_offset;
-		if (seglen > bytes)
-			seglen = bytes;
-
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@@ -2067,7 +2158,7 @@ generic_file_buffered_write(struct kiocb
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(fault_in_pages_readable(buf, seglen))) {
+		if (unlikely(iov_iter_fault_in_readable(&i))) {
 			status = -EFAULT;
 			break;
 		}
@@ -2098,8 +2189,8 @@ generic_file_buffered_write(struct kiocb
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = filemap_copy_from_user(src_page, offset,
-					cur_iov, nr_segs, iov_offset, bytes);
+			copied = iov_iter_copy_from_user(src_page, &i,
+								offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
 				page_cache_release(page);
@@ -2137,8 +2228,8 @@ generic_file_buffered_write(struct kiocb
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = filemap_copy_from_user_atomic(page, offset,
-					cur_iov, nr_segs, iov_offset, bytes);
+			copied = iov_iter_copy_from_user_atomic(page, &i,
+								offset, bytes);
 			pagefault_enable();
 		} else {
 			void *src, *dst;
@@ -2163,10 +2254,9 @@ generic_file_buffered_write(struct kiocb
 		if (src_page)
 			page_cache_release(src_page);
 
+		iov_iter_advance(&i, copied);
 		written += copied;
-		count -= copied;
 		pos += copied;
-		filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, copied);
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2190,7 +2280,7 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (count);
+	} while (iov_iter_count(&i));
 	*ppos = pos;
 
 	/*
Index: linux-2.6/mm/filemap.h
===================================================================
--- linux-2.6.orig/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- *	linux/mm/filemap.h
- *
- * Copyright (C) 1994-1999  Linus Torvalds
- */
-
-#ifndef __FILEMAP_H
-#define __FILEMAP_H
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/uio.h>
-#include <linux/uaccess.h>
-
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
-					const struct iovec *iov,
-					size_t base,
-					size_t bytes);
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user_atomic(struct page *page, unsigned long offset,
-			const struct iovec *iov, unsigned long nr_segs,
-			size_t base, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	if (likely(nr_segs == 1)) {
-		int left;
-		char __user *buf = iov->iov_base + base;
-		left = __copy_from_user_inatomic_nocache(kaddr + offset,
-							buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset,
-							iov, base, bytes);
-	}
-	kunmap_atomic(kaddr, KM_USER0);
-
-	return copied;
-}
-
-/*
- * This has the same sideeffects and return value as
- * filemap_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
-			const struct iovec *iov, unsigned long nr_segs,
-			 size_t base, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap(page);
-	if (likely(nr_segs == 1)) {
-		int left;
-		char __user *buf = iov->iov_base + base;
-		left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset,
-							iov, base, bytes);
-	}
-	kunmap(page);
-	return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, unsigned long nr_segs,
-						 size_t *basep, size_t bytes)
-{
-	if (likely(nr_segs == 1)) {
-		*basep += bytes;
-	} else {
-		const struct iovec *iov = *iovp;
-		size_t base = *basep;
-
-		while (bytes) {
-			int copy = min(bytes, iov->iov_len - base);
-
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				base = 0;
-			}
-		}
-		*iovp = iov;
-		*basep = base;
-	}
-}
-#endif

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-14 13:38 [patch 1/5] fs: add an iovec iterator Nick Piggin
@ 2007-03-14 13:38 ` Nick Piggin
  2007-03-14 21:28   ` Dmitriy Monakhov
                     ` (3 more replies)
  2007-03-14 13:38 ` [patch 3/5] fs: convert some simple filesystems Nick Piggin
                   ` (3 subsequent siblings)
  4 siblings, 4 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-14 13:38 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Christoph Hellwig, Nick Piggin, Andrew Morton,
	Mark Fasheh

Introduce write_begin, write_end, and perform_write aops.

These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

 Documentation/filesystems/Locking |    9 +
 Documentation/filesystems/vfs.txt |   38 ++++++
 drivers/block/loop.c              |   69 ++++-------
 fs/buffer.c                       |  144 ++++++++++++++++++++----
 fs/libfs.c                        |   40 ++++++
 fs/namei.c                        |   47 ++-----
 fs/splice.c                       |  106 +----------------
 include/linux/buffer_head.h       |    7 +
 include/linux/fs.h                |   29 ++++
 include/linux/pagemap.h           |    2 
 mm/filemap.c                      |  228 ++++++++++++++++++++++++++++++++++----
 11 files changed, 494 insertions(+), 225 deletions(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -449,6 +449,17 @@ struct address_space_operations {
 	 */
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
+	int (*perform_write)(struct file *, struct address_space *,
+				struct iov_iter *, loff_t);
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
@@ -463,6 +474,18 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 };
 
+/*
+ * pagecache_write_begin/pagecache_write_end must be used by general code
+ * to write into the pagecache.
+ */
+int pagecache_write_begin(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata);
+
+int pagecache_write_end(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
@@ -1902,6 +1925,12 @@ extern int simple_prepare_write(struct f
 			unsigned offset, unsigned to);
 extern int simple_commit_write(struct file *file, struct page *page,
 				unsigned offset, unsigned to);
+extern int simple_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, int intr,
+			struct page **pagep, void **fsdata);
+extern int simple_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata);
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
Index: linux-2.6/mm/filemap.c
===================================================================
--- linux-2.6.orig/mm/filemap.c
+++ linux-2.6/mm/filemap.c
@@ -2049,6 +2049,88 @@ inline int generic_write_checks(struct f
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+
+	if (aops->write_begin)
+		return aops->write_begin(file, mapping, pos, len, intr, pagep, fsdata);
+	else {
+		int ret;
+		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+		struct page *page;
+again:
+		page = __grab_cache_page(mapping, index);
+		*pagep = page;
+		if (!page)
+			return -ENOMEM;
+
+		if (intr && !PageUptodate(page)) {
+			/*
+			 * There is no way to resolve a short write situation
+			 * for a !Uptodate page (except by double copying in
+			 * the caller done by generic_perform_write_2copy).
+			 *
+			 * Instead, we have to bring it uptodate here.
+			 */
+			ret = aops->readpage(file, page);
+			page_cache_release(page);
+			if (ret) {
+				if (ret == AOP_TRUNCATED_PAGE)
+					goto again;
+				return ret;
+			}
+			goto again;
+		}
+
+		ret = aops->prepare_write(file, page, offset, offset+len);
+		if (ret) {
+			if (ret != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+			if (ret == AOP_TRUNCATED_PAGE)
+				goto again;
+		}
+		return ret;
+	}
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+	int ret;
+
+	if (aops->write_begin)
+		ret = aops->write_end(file, mapping, pos, len, copied, page, fsdata);
+	else {
+		int ret;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+
+		flush_dcache_page(page);
+		ret = aops->commit_write(file, page, offset, offset+len);
+		if (ret < 0) {
+			unlock_page(page);
+			page_cache_release(page);
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		} else
+			ret = copied;
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -2092,8 +2174,7 @@ EXPORT_SYMBOL(generic_file_direct_write)
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-static struct page *__grab_cache_page(struct address_space *mapping,
-							pgoff_t index)
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 {
 	int status;
 	struct page *page;
@@ -2115,19 +2196,14 @@ repeat:
 	return page;
 }
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+static ssize_t generic_perform_write_2copy(struct file *file,
+				struct iov_iter *i, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode 	*inode = mapping->host;
-	long		status = 0;
-	struct iov_iter i;
-
-	iov_iter_init(&i, iov, nr_segs, count, written);
+	struct inode *inode = mapping->host;
+	long status = 0;
+	ssize_t written = 0;
 
 	do {
 		struct page *src_page;
@@ -2140,7 +2216,7 @@ generic_file_buffered_write(struct kiocb
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(&i));
+						iov_iter_count(i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -2158,7 +2234,7 @@ generic_file_buffered_write(struct kiocb
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(iov_iter_fault_in_readable(&i))) {
+		if (unlikely(iov_iter_fault_in_readable(i))) {
 			status = -EFAULT;
 			break;
 		}
@@ -2189,7 +2265,7 @@ generic_file_buffered_write(struct kiocb
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = iov_iter_copy_from_user(src_page, &i,
+			copied = iov_iter_copy_from_user(src_page, i,
 								offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
@@ -2228,7 +2304,7 @@ generic_file_buffered_write(struct kiocb
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, &i,
+			copied = iov_iter_copy_from_user_atomic(page, i,
 								offset, bytes);
 			pagefault_enable();
 		} else {
@@ -2254,9 +2330,9 @@ generic_file_buffered_write(struct kiocb
 		if (src_page)
 			page_cache_release(src_page);
 
-		iov_iter_advance(&i, copied);
-		written += copied;
+		iov_iter_advance(i, copied);
 		pos += copied;
+		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2280,13 +2356,119 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (iov_iter_count(&i));
-	*ppos = pos;
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+				struct iov_iter *i, loff_t pos)
+{
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page;
+		pgoff_t index;		/* Pagecache index for current page */
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_count(i));
+
+again:
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = a_ops->write_begin(file, mapping, pos, bytes, 1,
+						&page, &fsdata);
+		if (unlikely(status))
+			break;
+
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
+		flush_dcache_page(page);
+
+		status = a_ops->write_end(file, mapping, pos, bytes, copied,
+						page, fsdata);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		iov_iter_advance(i, copied);
+		pos += copied;
+		written += copied;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	struct iov_iter i;
+
+	iov_iter_init(&i, iov, nr_segs, count, written);
+	if (a_ops->perform_write)
+		status = a_ops->perform_write(file, mapping, &i, pos);
+	else if (a_ops->write_begin)
+		status = generic_perform_write(file, &i, pos);
+	else
+		status = generic_perform_write_2copy(file, &i, pos);
 
-	/*
-	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-	 */
 	if (likely(status >= 0)) {
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c
+++ linux-2.6/fs/buffer.c
@@ -1846,36 +1846,50 @@ static int __block_prepare_write(struct 
 		} while ((bh = bh->b_this_page) != head);
 		return 0;
 	}
+
 	/* Error case: */
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale
-	 * data.  If BH_New is set, we know that the block was newly
-	 * allocated in the above loop.
-	 */
-	bh = head;
+	page_zero_new_buffers(page, from, to);
+	return err;
+}
+
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
 	block_start = 0;
 	do {
-		block_end = block_start+blocksize;
-		if (block_end <= from)
-			goto next_bh;
-		if (block_start >= to)
-			break;
+		block_end = block_start + bh->b_size;
+
 		if (buffer_new(bh)) {
-			void *kaddr;
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, end;
+					void *kaddr;
+
+					start = max(from, block_start);
+					end = min(to, block_end);
+
+					kaddr = kmap_atomic(page, KM_USER0);
+					memset(kaddr+start, 0, block_end-end);
+					flush_dcache_page(page);
+					kunmap_atomic(kaddr, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
 
-			clear_buffer_new(bh);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr+block_start, 0, bh->b_size);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
 		}
-next_bh:
+
 		block_start = block_end;
 		bh = bh->b_this_page;
 	} while (bh != head);
-	return err;
 }
 
 static int __block_commit_write(struct inode *inode, struct page *page,
@@ -1899,6 +1913,7 @@ static int __block_commit_write(struct i
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
+		clear_buffer_new(bh);
 	}
 
 	/*
@@ -1912,6 +1927,93 @@ static int __block_commit_write(struct i
 	return 0;
 }
 
+int block_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, int intr,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	int status = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = *pagep;
+	if (page == NULL) {
+		page = __grab_cache_page(mapping, index);
+		if (!page) {
+			status = -ENOMEM;
+			goto out;
+		}
+		*pagep = page;
+	}
+
+	status = __block_prepare_write(inode, page, start, end, get_block);
+	if (unlikely(status)) {
+		ClearPageUptodate(page);
+
+		unlock_page(page);
+		page_cache_release(page);
+
+		/*
+		 * prepare_write() may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
+		goto out;
+	}
+
+out:
+	return status;
+}
+
+int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len))
+		page_zero_new_buffers(page, start+copied, start+len);
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	/*
+	 * The buffers that were written will now be uptodate, so we don't
+	 * have to worry about a readpage reading them again.
+	 * XXX: however I'm not sure about partial writes to buffers. Must
+	 * recheck that, and hopefully we can remove the below test.
+	 */
+	if (!PageUptodate(page))
+		copied = 0;
+
+	unlock_page(page);
+	mark_page_accessed(page); /* XXX: put this in caller? */
+	page_cache_release(page);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		mark_inode_dirty(inode);
+	}
+
+	return copied;
+}
+
 /*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
Index: linux-2.6/include/linux/buffer_head.h
===================================================================
--- linux-2.6.orig/include/linux/buffer_head.h
+++ linux-2.6/include/linux/buffer_head.h
@@ -202,6 +202,13 @@ void block_invalidatepage(struct page *p
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
+int block_write_begin(struct file *, struct address_space *,
+				loff_t, unsigned, int,
+				struct page **, void **, get_block_t*);
+int block_write_end(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page *, void *);
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -85,6 +85,8 @@ unsigned find_get_pages_contig(struct ad
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */
Index: linux-2.6/fs/libfs.c
===================================================================
--- linux-2.6.orig/fs/libfs.c
+++ linux-2.6/fs/libfs.c
@@ -342,6 +342,24 @@ int simple_prepare_write(struct file *fi
 	return 0;
 }
 
+int simple_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, int intr,
+			struct page **pagep, void **fsdata)
+{
+	struct page *page;
+	pgoff_t index;
+	unsigned from;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	return simple_prepare_write(file, page, from, from+len);
+}
+
 int simple_commit_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@@ -360,6 +378,28 @@ int simple_commit_write(struct file *fil
 	return 0;
 }
 
+int simple_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len) {
+		void *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from + copied, 0, len - copied);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	simple_commit_write(file, page, from, from+copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
 {
 	struct inode *inode;
Index: linux-2.6/drivers/block/loop.c
===================================================================
--- linux-2.6.orig/drivers/block/loop.c
+++ linux-2.6/drivers/block/loop.c
@@ -206,11 +206,10 @@ lo_do_transfer(struct loop_device *lo, i
  * space operations prepare_write and commit_write.
  */
 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		int bsize, loff_t pos, struct page *page)
+		int bsize, loff_t pos, struct page *unused)
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
 	int len, ret;
@@ -222,67 +221,47 @@ static int do_lo_send_aops(struct loop_d
 	len = bvec->bv_len;
 	while (len > 0) {
 		sector_t IV;
-		unsigned size;
+		unsigned size, copied;
 		int transfer_result;
+		struct page *page;
+		void *fsdata;
 
 		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
 		size = PAGE_CACHE_SIZE - offset;
 		if (size > len)
 			size = len;
-		page = grab_cache_page(mapping, index);
-		if (unlikely(!page))
+
+		ret = pagecache_write_begin(file, mapping, pos, size, 1,
+							&page, &fsdata);
+		if (ret)
 			goto fail;
-		ret = aops->prepare_write(file, page, offset,
-					  offset + size);
-		if (unlikely(ret)) {
-			if (ret == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				continue;
-			}
-			goto unlock;
-		}
+
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
-		if (unlikely(transfer_result)) {
-			char *kaddr;
+		copied = size;
+		if (unlikely(transfer_result))
+			copied = 0;
+
+		ret = pagecache_write_end(file, mapping, pos, size, copied,
+							page, fsdata);
+		if (ret < 0)
+			goto fail;
+		if (ret < copied)
+			copied = ret;
 
-			/*
-			 * The transfer failed, but we still write the data to
-			 * keep prepare/commit calls balanced.
-			 */
-			printk(KERN_ERR "loop: transfer error block %llu\n",
-			       (unsigned long long)index);
-			kaddr = kmap_atomic(page, KM_USER0);
-			memset(kaddr + offset, 0, size);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset,
-					 offset + size);
-		if (unlikely(ret)) {
-			if (ret == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				continue;
-			}
-			goto unlock;
-		}
 		if (unlikely(transfer_result))
-			goto unlock;
-		bv_offs += size;
-		len -= size;
+			goto fail;
+
+		bv_offs += copied;
+		len -= copied;
 		offset = 0;
 		index++;
-		pos += size;
-		unlock_page(page);
-		page_cache_release(page);
+		pos += copied;
 	}
 	ret = 0;
 out:
 	mutex_unlock(&mapping->host->i_mutex);
 	return ret;
-unlock:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	ret = -1;
 	goto out;
Index: linux-2.6/fs/namei.c
===================================================================
--- linux-2.6.orig/fs/namei.c
+++ linux-2.6/fs/namei.c
@@ -2688,53 +2688,30 @@ int __page_symlink(struct inode *inode, 
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
+	void *fsdata;
 	int err;
 	char *kaddr;
 
 retry:
-	err = -ENOMEM;
-	page = find_or_create_page(mapping, 0, gfp_mask);
-	if (!page)
-		goto fail;
-	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
+	err = pagecache_write_begin(NULL, mapping, 0, PAGE_CACHE_SIZE, 0,
+							&page, &fsdata);
 	if (err)
-		goto fail_map;
+		goto fail;
+
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, symname, len-1);
+	memset(kaddr+len-1, 0, PAGE_CACHE_SIZE-(len-1));
 	kunmap_atomic(kaddr, KM_USER0);
-	err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
-	if (err)
-		goto fail_map;
-	/*
-	 * Notice that we are _not_ going to block here - end of page is
-	 * unmapped, so this will only try to map the rest of page, see
-	 * that it is unmapped (typically even will not look into inode -
-	 * ->i_size will be enough for everything) and zero it out.
-	 * OTOH it's obviously correct and should make the page up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		err = mapping->a_ops->readpage(NULL, page);
-		if (err != AOP_TRUNCATED_PAGE)
-			wait_on_page_locked(page);
-	} else {
-		unlock_page(page);
-	}
-	page_cache_release(page);
+
+	err = pagecache_write_end(NULL, mapping, 0, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
+							page, fsdata);
 	if (err < 0)
 		goto fail;
+	if (err < PAGE_CACHE_SIZE)
+		goto retry;
+
 	mark_inode_dirty(inode);
 	return 0;
-fail_map:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	return err;
 }
Index: linux-2.6/fs/splice.c
===================================================================
--- linux-2.6.orig/fs/splice.c
+++ linux-2.6/fs/splice.c
@@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod
 	struct address_space *mapping = file->f_mapping;
 	unsigned int offset, this_len;
 	struct page *page;
-	pgoff_t index;
+	void *fsdata;
 	int ret;
 
 	/*
@@ -569,13 +569,13 @@ static int pipe_to_file(struct pipe_inod
 	if (unlikely(ret))
 		return ret;
 
-	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
 	this_len = sd->len;
 	if (this_len + offset > PAGE_CACHE_SIZE)
 		this_len = PAGE_CACHE_SIZE - offset;
 
+#if 0
 	/*
 	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
 	 * page.
@@ -587,86 +587,11 @@ static int pipe_to_file(struct pipe_inod
 		 * locked on successful return.
 		 */
 		if (buf->ops->steal(pipe, buf))
-			goto find_page;
+#endif
 
-		page = buf->page;
-		if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
-			unlock_page(page);
-			goto find_page;
-		}
-
-		page_cache_get(page);
-
-		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
-			lru_cache_add(page);
-	} else {
-find_page:
-		page = find_lock_page(mapping, index);
-		if (!page) {
-			ret = -ENOMEM;
-			page = page_cache_alloc_cold(mapping);
-			if (unlikely(!page))
-				goto out_ret;
-
-			/*
-			 * This will also lock the page
-			 */
-			ret = add_to_page_cache_lru(page, mapping, index,
-						    GFP_KERNEL);
-			if (unlikely(ret))
-				goto out;
-		}
-
-		/*
-		 * We get here with the page locked. If the page is also
-		 * uptodate, we don't need to do more. If it isn't, we
-		 * may need to bring it in if we are not going to overwrite
-		 * the full page.
-		 */
-		if (!PageUptodate(page)) {
-			if (this_len < PAGE_CACHE_SIZE) {
-				ret = mapping->a_ops->readpage(file, page);
-				if (unlikely(ret))
-					goto out;
-
-				lock_page(page);
-
-				if (!PageUptodate(page)) {
-					/*
-					 * Page got invalidated, repeat.
-					 */
-					if (!page->mapping) {
-						unlock_page(page);
-						page_cache_release(page);
-						goto find_page;
-					}
-					ret = -EIO;
-					goto out;
-				}
-			} else
-				SetPageUptodate(page);
-		}
-	}
-
-	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-	if (unlikely(ret)) {
-		loff_t isize = i_size_read(mapping->host);
-
-		if (ret != AOP_TRUNCATED_PAGE)
-			unlock_page(page);
-		page_cache_release(page);
-		if (ret == AOP_TRUNCATED_PAGE)
-			goto find_page;
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again.
-		 */
-		if (sd->pos + this_len > isize)
-			vmtruncate(mapping->host, isize);
-
-		goto out_ret;
-	}
+	ret = pagecache_write_begin(file, mapping, sd->pos, sd->len, 0, &page, &fsdata);
+	if (unlikely(ret))
+		goto out;
 
 	if (buf->page != page) {
 		/*
@@ -676,28 +601,13 @@ find_page:
 		char *dst = kmap_atomic(page, KM_USER1);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
-		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER1);
 		buf->ops->unmap(pipe, buf, src);
 	}
 
-	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-	if (!ret) {
-		/*
-		 * Return the number of bytes written and mark page as
-		 * accessed, we are now done!
-		 */
-		ret = this_len;
-		mark_page_accessed(page);
-		balance_dirty_pages_ratelimited(mapping);
-	} else if (ret == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto find_page;
-	}
+	ret = pagecache_write_end(file, mapping, sd->pos, sd->len, sd->len, page, fsdata);
+
 out:
-	page_cache_release(page);
-	unlock_page(page);
-out_ret:
 	return ret;
 }
 
Index: linux-2.6/Documentation/filesystems/Locking
===================================================================
--- linux-2.6.orig/Documentation/filesystems/Locking
+++ linux-2.6/Documentation/filesystems/Locking
@@ -176,15 +176,18 @@ prototypes:
 locking rules:
 	All except set_page_dirty may block
 
-			BKL	PageLocked(page)
+			BKL	PageLocked(page)	i_sem
 writepage:		no	yes, unlocks (see below)
 readpage:		no	yes, unlocks
 sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes
-commit_write:		no	yes
+prepare_write:		no	yes			yes
+commit_write:		no	yes			yes
+write_begin:		no	locks the page		yes
+write_end:		no	yes, unlocks		yes
+perform_write:		no	n/a			yes
 bmap:			yes
 invalidatepage:		no	yes
 releasepage:		no	yes
Index: linux-2.6/Documentation/filesystems/vfs.txt
===================================================================
--- linux-2.6.orig/Documentation/filesystems/vfs.txt
+++ linux-2.6/Documentation/filesystems/vfs.txt
@@ -534,6 +534,14 @@ struct address_space_operations {
 			struct list_head *pages, unsigned nr_pages);
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+	int (*perform_write)(struct file *, struct address_space *,
+				struct iov_iter *, loff_t);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@@ -629,6 +637,36 @@ struct address_space_operations {
         operations.  It should avoid returning an error if possible -
         errors should have been handled by prepare_write.
 
+  write_begin: This is intended as a replacement for prepare_write. Called
+        by the generic buffered write code to ask the filesystem to prepare
+        to write len bytes at the given offset in the file. intr is a boolean
+        that specifies whether or not the filesystem must be prepared to
+        handle a short write (ie. write_end called with less than len bytes
+        written).
+
+        The filesystem must return the locked pagecache page for the caller
+        to write into.
+
+        A void * may be returned in fsdata, which then gets passed into
+        write_end.
+
+        Returns < 0 on failure, in which case all cleanup must be done and
+        write_end not called. 0 on success, in which case write_end must
+        be called.
+
+  write_end: After a successful write_begin, and data copy, write_end must
+        be called. len is the original len passed to write_begin, and copied
+        is the amount that was able to be copied (they must be equal if
+        write_begin was called with intr == 0).
+
+        The filesystem must take care of unlocking the page and dropping its
+        refcount, and updating i_size.
+
+        Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+        that were able to be copied into the file.
+
+  perform_write:
+
   bmap: called by the VFS to map a logical block offset within object to
   	physical block number. This method is used by the FIBMAP
   	ioctl and for working with swap-files.  To be able to swap to

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [patch 3/5] fs: convert some simple filesystems
  2007-03-14 13:38 [patch 1/5] fs: add an iovec iterator Nick Piggin
  2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
@ 2007-03-14 13:38 ` Nick Piggin
  2007-03-14 13:38 ` [patch 4/5] ext2: convert to new aops Nick Piggin
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-14 13:38 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Christoph Hellwig, Nick Piggin, Andrew Morton,
	Mark Fasheh

Implement new aops for some of the simpler filesystems.

 fs/configfs/inode.c |    4 ++--
 fs/sysfs/inode.c    |    4 ++--
 mm/shmem.c          |   16 ++++++++++------
 3 files changed, 14 insertions(+), 10 deletions(-)

Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -1419,10 +1419,14 @@ static const struct inode_operations shm
  * lets a tmpfs file be used read-write below the loop driver.
  */
 static int
-shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+shmem_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, int intr,
+			struct page **pagep, void **fsdata)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	*pagep = NULL;
+	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
 static ssize_t
@@ -2319,8 +2323,8 @@ static const struct address_space_operat
 	.writepage	= shmem_writepage,
 	.set_page_dirty	= __set_page_dirty_no_writeback,
 #ifdef CONFIG_TMPFS
-	.prepare_write	= shmem_prepare_write,
-	.commit_write	= simple_commit_write,
+	.write_begin	= shmem_write_begin,
+	.write_end	= simple_write_end,
 #endif
 	.migratepage	= migrate_page,
 };
Index: linux-2.6/fs/configfs/inode.c
===================================================================
--- linux-2.6.orig/fs/configfs/inode.c
+++ linux-2.6/fs/configfs/inode.c
@@ -40,8 +40,8 @@ extern struct super_block * configfs_sb;
 
 static const struct address_space_operations configfs_aops = {
 	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
 };
 
 static struct backing_dev_info configfs_backing_dev_info = {
Index: linux-2.6/fs/sysfs/inode.c
===================================================================
--- linux-2.6.orig/fs/sysfs/inode.c
+++ linux-2.6/fs/sysfs/inode.c
@@ -20,8 +20,8 @@ extern struct super_block * sysfs_sb;
 
 static const struct address_space_operations sysfs_aops = {
 	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
 };
 
 static struct backing_dev_info sysfs_backing_dev_info = {

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [patch 4/5] ext2: convert to new aops
  2007-03-14 13:38 [patch 1/5] fs: add an iovec iterator Nick Piggin
  2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
  2007-03-14 13:38 ` [patch 3/5] fs: convert some simple filesystems Nick Piggin
@ 2007-03-14 13:38 ` Nick Piggin
  2007-03-14 13:38 ` [patch 5/5] ext3: " Nick Piggin
  2007-03-14 13:51 ` [patch 1/5] fs: add an iovec iterator Nick Piggin
  4 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-14 13:38 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Christoph Hellwig, Nick Piggin, Andrew Morton,
	Mark Fasheh

Implement new aops for ext2.

 fs/ext2/inode.c |   12 ++++++++++++
 1 file changed, 12 insertions(+)

Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c
+++ linux-2.6/fs/ext2/inode.c
@@ -643,6 +643,16 @@ ext2_readpages(struct file *file, struct
 }
 
 static int
+ext2_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, int intr,
+		struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+	return block_write_begin(file, mapping, pos, len, intr, pagep, fsdata,
+							ext2_get_block);
+}
+
+static int
 ext2_prepare_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@@ -689,6 +699,8 @@ const struct address_space_operations ex
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
 	.sync_page		= block_sync_page,
+	.write_begin		= ext2_write_begin,
+	.write_end		= block_write_end,
 	.prepare_write		= ext2_prepare_write,
 	.commit_write		= generic_commit_write,
 	.bmap			= ext2_bmap,

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [patch 5/5] ext3: convert to new aops
  2007-03-14 13:38 [patch 1/5] fs: add an iovec iterator Nick Piggin
                   ` (2 preceding siblings ...)
  2007-03-14 13:38 ` [patch 4/5] ext2: convert to new aops Nick Piggin
@ 2007-03-14 13:38 ` Nick Piggin
  2007-03-14 13:51 ` [patch 1/5] fs: add an iovec iterator Nick Piggin
  4 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-14 13:38 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Christoph Hellwig, Nick Piggin, Andrew Morton,
	Mark Fasheh

Implement new aops for ext3. Probably has some bugs in interaction with
journalling, and corner cases aren't tested/thought out fully, but it
boots and runs. I don't see a fundamental reason why it can't work...

 fs/ext3/inode.c |  137 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 88 insertions(+), 49 deletions(-)

Index: linux-2.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.orig/fs/ext3/inode.c
+++ linux-2.6/fs/ext3/inode.c
@@ -1155,7 +1155,7 @@ static int do_journal_get_write_access(h
  * This content is expected to be set to zeroes by block_prepare_write().
  * 2006/10/14  SAW
  */
-static int ext3_prepare_failure(struct file *file, struct page *page,
+static int ext3_write_failure(struct file *file, struct page *page,
 				unsigned from, unsigned to)
 {
 	struct address_space *mapping;
@@ -1208,29 +1208,40 @@ skip:
 	return mapping->a_ops->commit_write(file, page, from, block_start);
 }
 
-static int ext3_prepare_write(struct file *file, struct page *page,
-			      unsigned from, unsigned to)
+static int ext3_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, int intr,
+				struct page **pagep, void **fsdata)
 {
-	struct inode *inode = page->mapping->host;
-	int ret, ret2;
+	struct inode *inode = mapping->host;
 	int needed_blocks = ext3_writepage_trans_blocks(inode);
+	int ret, ret2;
 	handle_t *handle;
 	int retries = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos * (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
 
 retry:
 	handle = ext3_journal_start(inode, needed_blocks);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
-	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
-		ret = nobh_prepare_write(page, from, to, ext3_get_block);
-	else
-		ret = block_prepare_write(page, from, to, ext3_get_block);
+	ret = block_write_begin(file, mapping, pos, len, intr, pagep, fsdata,
+							ext3_get_block);
 	if (ret)
 		goto failure;
 
 	if (ext3_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
-				from, to, NULL, do_journal_get_write_access);
+				start, end, NULL, do_journal_get_write_access);
 		if (ret)
 			/* fatal error, just put the handle and return */
 			journal_stop(handle);
@@ -1238,7 +1249,7 @@ retry:
 	return ret;
 
 failure:
-	ret2 = ext3_prepare_failure(file, page, from, to);
+	ret2 = ext3_write_failure(file, page, start, end);
 	if (ret2 < 0)
 		return ret2;
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
@@ -1247,17 +1258,18 @@ failure:
 	return ret;
 }
 
+
 int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 {
 	int err = journal_dirty_data(handle, bh);
 	if (err)
 		ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
-						bh, handle,err);
+						bh, handle, err);
 	return err;
 }
 
-/* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+/* For write_end() in data=journal mode */
+static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 {
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
@@ -1272,78 +1284,103 @@ static int commit_write_fn(handle_t *han
  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
  * buffers are managed internally.
  */
-static int ext3_ordered_commit_write(struct file *file, struct page *page,
-			     unsigned from, unsigned to)
+static int ext3_ordered_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext3_journal_current_handle();
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = file->f_mapping->host;
+	unsigned from, to;
 	int ret = 0, ret2;
 
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
 	ret = walk_page_buffers(handle, page_buffers(page),
 		from, to, NULL, ext3_journal_dirty_data);
 
 	if (ret == 0) {
 		/*
-		 * generic_commit_write() will run mark_inode_dirty() if i_size
+		 * block_write_end() will run mark_inode_dirty() if i_size
 		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
 		 * into that.
 		 */
 		loff_t new_i_size;
 
-		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+		new_i_size = pos + copied;
 		if (new_i_size > EXT3_I(inode)->i_disksize)
 			EXT3_I(inode)->i_disksize = new_i_size;
-		ret = generic_commit_write(file, page, from, to);
+		copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+		if (copied < 0)
+			ret = copied;
 	}
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	return ret;
+	return ret ? ret : copied;
 }
 
-static int ext3_writeback_commit_write(struct file *file, struct page *page,
-			     unsigned from, unsigned to)
+static int ext3_writeback_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext3_journal_current_handle();
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = file->f_mapping->host;
 	int ret = 0, ret2;
 	loff_t new_i_size;
 
-	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	new_i_size = pos + copied;
 	if (new_i_size > EXT3_I(inode)->i_disksize)
 		EXT3_I(inode)->i_disksize = new_i_size;
 
-	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
-		ret = nobh_commit_write(file, page, from, to);
-	else
-		ret = generic_commit_write(file, page, from, to);
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (copied < 0)
+		ret = copied;
 
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	return ret;
+	return ret ? ret : copied;
 }
 
-static int ext3_journalled_commit_write(struct file *file,
-			struct page *page, unsigned from, unsigned to)
+static int ext3_journalled_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
 {
 	handle_t *handle = ext3_journal_current_handle();
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 	int partial = 0;
-	loff_t pos;
+	unsigned from, to;
 
-	/*
-	 * Here we duplicate the generic_commit_write() functionality
-	 */
-	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
+
+	if (copied < len) {
+		if (PageUptodate(page))
+			copied = len;
+		else {
+			/* XXX: don't need to zero new buffers because we abort? */
+			copied = 0;
+			if (!is_handle_aborted(handle))
+				journal_abort_handle(handle);
+			unlock_page(page);
+			page_cache_release(page);
+			goto out;
+		}
+	}
 
 	ret = walk_page_buffers(handle, page_buffers(page), from,
-				to, &partial, commit_write_fn);
+				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
+	unlock_page(page);
+	page_cache_release(page);
+	if (pos+copied > inode->i_size)
+		i_size_write(inode, pos+copied);
 	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
@@ -1351,10 +1388,12 @@ static int ext3_journalled_commit_write(
 		if (!ret)
 			ret = ret2;
 	}
+
+out:
 	ret2 = ext3_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	return ret;
+	return ret ? ret : copied;
 }
 
 /*
@@ -1612,7 +1651,7 @@ static int ext3_journalled_writepage(str
 			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
 
 		err = walk_page_buffers(handle, page_buffers(page), 0,
-				PAGE_CACHE_SIZE, NULL, commit_write_fn);
+				PAGE_CACHE_SIZE, NULL, write_end_fn);
 		if (ret == 0)
 			ret = err;
 		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
@@ -1772,8 +1811,8 @@ static const struct address_space_operat
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
 	.sync_page	= block_sync_page,
-	.prepare_write	= ext3_prepare_write,
-	.commit_write	= ext3_ordered_commit_write,
+	.write_begin	= ext3_write_begin,
+	.write_end	= ext3_ordered_write_end,
 	.bmap		= ext3_bmap,
 	.invalidatepage	= ext3_invalidatepage,
 	.releasepage	= ext3_releasepage,
@@ -1786,8 +1825,8 @@ static const struct address_space_operat
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_writeback_writepage,
 	.sync_page	= block_sync_page,
-	.prepare_write	= ext3_prepare_write,
-	.commit_write	= ext3_writeback_commit_write,
+	.write_begin	= ext3_write_begin,
+	.write_end	= ext3_writeback_write_end,
 	.bmap		= ext3_bmap,
 	.invalidatepage	= ext3_invalidatepage,
 	.releasepage	= ext3_releasepage,
@@ -1800,8 +1839,8 @@ static const struct address_space_operat
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_journalled_writepage,
 	.sync_page	= block_sync_page,
-	.prepare_write	= ext3_prepare_write,
-	.commit_write	= ext3_journalled_commit_write,
+	.write_begin	= ext3_write_begin,
+	.write_end	= ext3_journalled_write_end,
 	.set_page_dirty	= ext3_journalled_set_page_dirty,
 	.bmap		= ext3_bmap,
 	.invalidatepage	= ext3_invalidatepage,

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 1/5] fs: add an iovec iterator
  2007-03-14 13:38 [patch 1/5] fs: add an iovec iterator Nick Piggin
                   ` (3 preceding siblings ...)
  2007-03-14 13:38 ` [patch 5/5] ext3: " Nick Piggin
@ 2007-03-14 13:51 ` Nick Piggin
  4 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-14 13:51 UTC (permalink / raw)
  To: Linux Filesystems
  Cc: Linux Kernel, Christoph Hellwig, Andrew Morton, Mark Fasheh

On Wed, Mar 14, 2007 at 02:38:12PM +0100, Nick Piggin wrote:
> Add an iterator data structure to operate over an iovec. Add usercopy
> operators needed by generic_file_buffered_write, and convert that function
> over.

Just a note to anyone looking at these -- they don't apply to any tree,
and I'm posting them at this stage mainly to get feedback on the a_ops
APIs. Comments from anyone welcome.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
@ 2007-03-14 21:28   ` Dmitriy Monakhov
  2007-03-15  3:55     ` Nick Piggin
       [not found]   ` <200703142246.27167.m.kozlowski@tuxland.pl>
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 19+ messages in thread
From: Dmitriy Monakhov @ 2007-03-14 21:28 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton,
	Mark Fasheh

Nick Piggin <npiggin@suse.de> writes:

> Introduce write_begin, write_end, and perform_write aops.
>
> These are intended to replace prepare_write and commit_write with more
> flexible alternatives that are also able to avoid the buffered write
> deadlock problems efficiently (which prepare_write is unable to do).
>
>  Documentation/filesystems/Locking |    9 +
>  Documentation/filesystems/vfs.txt |   38 ++++++
>  drivers/block/loop.c              |   69 ++++-------
>  fs/buffer.c                       |  144 ++++++++++++++++++++----
>  fs/libfs.c                        |   40 ++++++
>  fs/namei.c                        |   47 ++-----
>  fs/splice.c                       |  106 +----------------
>  include/linux/buffer_head.h       |    7 +
>  include/linux/fs.h                |   29 ++++
>  include/linux/pagemap.h           |    2 
>  mm/filemap.c                      |  228 ++++++++++++++++++++++++++++++++++----
>  11 files changed, 494 insertions(+), 225 deletions(-)
>
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h
> +++ linux-2.6/include/linux/fs.h
> @@ -449,6 +449,17 @@ struct address_space_operations {
>  	 */
>  	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
>  	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> +
> +	int (*write_begin)(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, int intr,
> +				struct page **pagep, void **fsdata);
> +	int (*write_end)(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, unsigned copied,
> +				struct page *page, void *fsdata);
> +
> +	int (*perform_write)(struct file *, struct address_space *,
> +				struct iov_iter *, loff_t);
> +
>  	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
>  	sector_t (*bmap)(struct address_space *, sector_t);
>  	void (*invalidatepage) (struct page *, unsigned long);
> @@ -463,6 +474,18 @@ struct address_space_operations {
>  	int (*launder_page) (struct page *);
>  };
>  
> +/*
> + * pagecache_write_begin/pagecache_write_end must be used by general code
> + * to write into the pagecache.
> + */
> +int pagecache_write_begin(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, int intr,
> +				struct page **pagep, void **fsdata);
> +
> +int pagecache_write_end(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, unsigned copied,
> +				struct page *page, void *fsdata);
> +
>  struct backing_dev_info;
>  struct address_space {
>  	struct inode		*host;		/* owner: inode, block_device */
> @@ -1902,6 +1925,12 @@ extern int simple_prepare_write(struct f
>  			unsigned offset, unsigned to);
>  extern int simple_commit_write(struct file *file, struct page *page,
>  				unsigned offset, unsigned to);
> +extern int simple_write_begin(struct file *file, struct address_space *mapping,
> +			loff_t pos, unsigned len, int intr,
> +			struct page **pagep, void **fsdata);
> +extern int simple_write_end(struct file *file, struct address_space *mapping,
> +			loff_t pos, unsigned len, unsigned copied,
> +			struct page *page, void *fsdata);
>  
>  extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
>  extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
> Index: linux-2.6/mm/filemap.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap.c
> +++ linux-2.6/mm/filemap.c
> @@ -2049,6 +2049,88 @@ inline int generic_write_checks(struct f
>  }
>  EXPORT_SYMBOL(generic_write_checks);
>  
> +int pagecache_write_begin(struct file *file, struct address_space *mapping,
> +				loff_t pos, unsigned len, int intr,
> +				struct page **pagep, void **fsdata)
> +{
> +	const struct address_space_operations *aops = mapping->a_ops;
> +
> +	if (aops->write_begin)
> +		return aops->write_begin(file, mapping, pos, len, intr, pagep, fsdata);
> +	else {
> +		int ret;
> +		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
> +		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
> +		struct inode *inode = mapping->host;
> +		struct page *page;
> +again:
> +		page = __grab_cache_page(mapping, index);
> +		*pagep = page;
> +		if (!page)
> +			return -ENOMEM;
> +
> +		if (intr && !PageUptodate(page)) {
> +			/*
> +			 * There is no way to resolve a short write situation
> +			 * for a !Uptodate page (except by double copying in
> +			 * the caller done by generic_perform_write_2copy).
> +			 *
> +			 * Instead, we have to bring it uptodate here.
> +			 */
> +			ret = aops->readpage(file, page);
> +			page_cache_release(page);
> +			if (ret) {
> +				if (ret == AOP_TRUNCATED_PAGE)
> +					goto again;
> +				return ret;
> +			}
> +			goto again;
> +		}
> +
> +		ret = aops->prepare_write(file, page, offset, offset+len);
> +		if (ret) {
> +			if (ret != AOP_TRUNCATED_PAGE)
> +				unlock_page(page);
> +			page_cache_release(page);
> +			if (pos + len > inode->i_size)
> +				vmtruncate(inode, inode->i_size);
> +			if (ret == AOP_TRUNCATED_PAGE)
> +				goto again;
> +		}
> +		return ret;
If prepare_write succeed we return with locked and getted page, let's remember it as [1]
> +	}
> +}
> +EXPORT_SYMBOL(pagecache_write_begin);
> +
> +int pagecache_write_end(struct file *file, struct address_space *mapping,
> +				loff_t pos, unsigned len, unsigned copied,
> +				struct page *page, void *fsdata)
> +{
> +	const struct address_space_operations *aops = mapping->a_ops;
> +	int ret;
> +
> +	if (aops->write_begin)
> +		ret = aops->write_end(file, mapping, pos, len, copied, page, fsdata);
> +	else {
> +		int ret;
> +		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
> +		struct inode *inode = mapping->host;
> +
> +		flush_dcache_page(page);
> +		ret = aops->commit_write(file, page, offset, offset+len);
> +		if (ret < 0) {
> +			unlock_page(page);
> +			page_cache_release(page);
> +			if (pos + len > inode->i_size)
> +				vmtruncate(inode, inode->i_size);
> +		} else
> +			ret = copied;
What about AOP_TRUNCATED_PAGE?  Off corse we can't just "goto retry" here :) ,
but we may return it to caller and let's caller handle it.  
> +	}
> +
> +	return copied;
if ->commit_write return non negative value we return with sill locked page  look above at [1] 
may be it will be unlocked by caller? I guess no it was just forgoten.
> +}
> +EXPORT_SYMBOL(pagecache_write_end);
> +
>  ssize_t
>  generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>  		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
> @@ -2092,8 +2174,7 @@ EXPORT_SYMBOL(generic_file_direct_write)
>   * Find or create a page at the given pagecache position. Return the locked
>   * page. This function is specifically for buffered writes.
>   */
> -static struct page *__grab_cache_page(struct address_space *mapping,
> -							pgoff_t index)
> +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
>  {
>  	int status;
>  	struct page *page;
> @@ -2115,19 +2196,14 @@ repeat:
>  	return page;
>  }
>  
> -ssize_t
> -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> -		unsigned long nr_segs, loff_t pos, loff_t *ppos,
> -		size_t count, ssize_t written)
> +static ssize_t generic_perform_write_2copy(struct file *file,
> +				struct iov_iter *i, loff_t pos)
>  {
> -	struct file *file = iocb->ki_filp;
>  	struct address_space *mapping = file->f_mapping;
>  	const struct address_space_operations *a_ops = mapping->a_ops;
> -	struct inode 	*inode = mapping->host;
> -	long		status = 0;
> -	struct iov_iter i;
> -
> -	iov_iter_init(&i, iov, nr_segs, count, written);
> +	struct inode *inode = mapping->host;
> +	long status = 0;
> +	ssize_t written = 0;
>  
>  	do {
>  		struct page *src_page;
> @@ -2140,7 +2216,7 @@ generic_file_buffered_write(struct kiocb
>  		offset = (pos & (PAGE_CACHE_SIZE - 1));
>  		index = pos >> PAGE_CACHE_SHIFT;
>  		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
> -						iov_iter_count(&i));
> +						iov_iter_count(i));
>  
>  		/*
>  		 * a non-NULL src_page indicates that we're doing the
> @@ -2158,7 +2234,7 @@ generic_file_buffered_write(struct kiocb
>  		 * to check that the address is actually valid, when atomic
>  		 * usercopies are used, below.
>  		 */
> -		if (unlikely(iov_iter_fault_in_readable(&i))) {
> +		if (unlikely(iov_iter_fault_in_readable(i))) {
>  			status = -EFAULT;
>  			break;
>  		}
> @@ -2189,7 +2265,7 @@ generic_file_buffered_write(struct kiocb
>  			 * same reason as we can't take a page fault with a
>  			 * page locked (as explained below).
>  			 */
> -			copied = iov_iter_copy_from_user(src_page, &i,
> +			copied = iov_iter_copy_from_user(src_page, i,
>  								offset, bytes);
>  			if (unlikely(copied == 0)) {
>  				status = -EFAULT;
> @@ -2228,7 +2304,7 @@ generic_file_buffered_write(struct kiocb
>  			 * really matter.
>  			 */
>  			pagefault_disable();
> -			copied = iov_iter_copy_from_user_atomic(page, &i,
> +			copied = iov_iter_copy_from_user_atomic(page, i,
>  								offset, bytes);
>  			pagefault_enable();
>  		} else {
> @@ -2254,9 +2330,9 @@ generic_file_buffered_write(struct kiocb
>  		if (src_page)
>  			page_cache_release(src_page);
>  
> -		iov_iter_advance(&i, copied);
> -		written += copied;
> +		iov_iter_advance(i, copied);
>  		pos += copied;
> +		written += copied;
>  
>  		balance_dirty_pages_ratelimited(mapping);
>  		cond_resched();
> @@ -2280,13 +2356,119 @@ fs_write_aop_error:
>  			continue;
>  		else
>  			break;
> -	} while (iov_iter_count(&i));
> -	*ppos = pos;
> +	} while (iov_iter_count(i));
> +
> +	return written ? written : status;
> +}
> +
> +static ssize_t generic_perform_write(struct file *file,
> +				struct iov_iter *i, loff_t pos)
> +{
> +	struct address_space *mapping = file->f_mapping;
> +	const struct address_space_operations *a_ops = mapping->a_ops;
> +	long status = 0;
> +	ssize_t written = 0;
> +
> +	do {
> +		struct page *page;
> +		pgoff_t index;		/* Pagecache index for current page */
> +		unsigned long offset;	/* Offset into pagecache page */
> +		unsigned long bytes;	/* Bytes to write to page */
> +		size_t copied;		/* Bytes copied from user */
> +		void *fsdata;
> +
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +		index = pos >> PAGE_CACHE_SHIFT;
> +		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
> +						iov_iter_count(i));
> +
> +again:
> +
> +		/*
> +		 * Bring in the user page that we will copy from _first_.
> +		 * Otherwise there's a nasty deadlock on copying from the
> +		 * same page as we're writing to, without it being marked
> +		 * up-to-date.
> +		 *
> +		 * Not only is this an optimisation, but it is also required
> +		 * to check that the address is actually valid, when atomic
> +		 * usercopies are used, below.
> +		 */
> +		if (unlikely(iov_iter_fault_in_readable(i))) {
> +			status = -EFAULT;
> +			break;
> +		}
> +
> +		status = a_ops->write_begin(file, mapping, pos, bytes, 1,
> +						&page, &fsdata);
> +		if (unlikely(status))
> +			break;
> +
> +		pagefault_disable();
> +		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
> +		pagefault_enable();
> +		flush_dcache_page(page);
> +
> +		status = a_ops->write_end(file, mapping, pos, bytes, copied,
> +						page, fsdata);
> +		if (unlikely(status < 0))
> +			break;
> +		copied = status;
> +
> +		cond_resched();
> +
> +		if (unlikely(copied == 0)) {
> +			/*
> +			 * If we were unable to copy any data at all, we must
> +			 * fall back to a single segment length write.
> +			 *
> +			 * If we didn't fallback here, we could livelock
> +			 * because not all segments in the iov can be copied at
> +			 * once without a pagefault.
> +			 */
> +			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
> +						iov_iter_single_seg_count(i));
> +			goto again;
> +		}
> +		iov_iter_advance(i, copied);
> +		pos += copied;
> +		written += copied;
WOW caller forgot unlock page too see above at [1].
> +
> +		balance_dirty_pages_ratelimited(mapping);
> +
> +	} while (iov_iter_count(i));
> +
> +	return written ? written : status;
> +}
> +
> +ssize_t
> +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> +		unsigned long nr_segs, loff_t pos, loff_t *ppos,
> +		size_t count, ssize_t written)
> +{
> +	struct file *file = iocb->ki_filp;
> +	struct address_space *mapping = file->f_mapping;
> +	const struct address_space_operations *a_ops = mapping->a_ops;
> +	struct inode 	*inode = mapping->host;
> +	long		status = 0;
> +	struct iov_iter i;
> +
> +	iov_iter_init(&i, iov, nr_segs, count, written);
> +	if (a_ops->perform_write)
> +		status = a_ops->perform_write(file, mapping, &i, pos);
> +	else if (a_ops->write_begin)
> +		status = generic_perform_write(file, &i, pos);
> +	else
> +		status = generic_perform_write_2copy(file, &i, pos);
>  
> -	/*
> -	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
> -	 */
>  	if (likely(status >= 0)) {
> +		written += status;
> +		*ppos = pos + status;
> +
> +		/*
> +		 * For now, when the user asks for O_SYNC, we'll actually give
> +		 * O_DSYNC
> +		 */
>  		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
>  			if (!a_ops->writepage || !is_sync_kiocb(iocb))
>  				status = generic_osync_inode(inode, mapping,
> Index: linux-2.6/fs/buffer.c
> ===================================================================
> --- linux-2.6.orig/fs/buffer.c
> +++ linux-2.6/fs/buffer.c
> @@ -1846,36 +1846,50 @@ static int __block_prepare_write(struct 
>  		} while ((bh = bh->b_this_page) != head);
>  		return 0;
>  	}
> +
>  	/* Error case: */
> -	/*
> -	 * Zero out any newly allocated blocks to avoid exposing stale
> -	 * data.  If BH_New is set, we know that the block was newly
> -	 * allocated in the above loop.
> -	 */
> -	bh = head;
> +	page_zero_new_buffers(page, from, to);
> +	return err;
> +}
> +
> +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
> +{
> +	unsigned int block_start, block_end;
> +	struct buffer_head *head, *bh;
> +
> +	BUG_ON(!PageLocked(page));
> +	if (!page_has_buffers(page))
> +		return;__block_prepare_write 
> +
> +	bh = head = page_buffers(page);
>  	block_start = 0;
>  	do {
> -		block_end = block_start+blocksize;
> -		if (block_end <= from)
> -			goto next_bh;
> -		if (block_start >= to)
> -			break;
> +		block_end = block_start + bh->b_size;
> +
>  		if (buffer_new(bh)) {
> -			void *kaddr;
> +			if (block_end > from && block_start < to) {
> +				if (!PageUptodate(page)) {
> +					unsigned start, end;
> +					void *kaddr;
> +
> +					start = max(from, block_start);
> +					end = min(to, block_end);
> +
> +					kaddr = kmap_atomic(page, KM_USER0);
> +					memset(kaddr+start, 0, block_end-end);
<<<<OOPS why you new block wasn't fully zeroed? as it was done before.
At least this result in information leak in case of (stat == from)
just imagine fs with blocksize == 1k conains file with i_size == 4096 and 
fist two blocks not mapped (hole), now invoke write op from 1023 to 2048.
For example we succeed in allocating first block, but faile while allocating second
, then we call page_zero_new_buffers(...from == 1023, to == 2048)
  and then zerro only one last byte for first block, and set is uptodate
After this we just do read( from == 0, to == 1023) and steal old block content.
 
> +					flush_dcache_page(page);
> +					kunmap_atomic(kaddr, KM_USER0);
> +					set_buffer_uptodate(bh);
> +				}
>  
> -			clear_buffer_new(bh);
> -			kaddr = kmap_atomic(page, KM_USER0);
> -			memset(kaddr+block_start, 0, bh->b_size);
> -			flush_dcache_page(page);
> -			kunmap_atomic(kaddr, KM_USER0);
> -			set_buffer_uptodate(bh);
> -			mark_buffer_dirty(bh);
> +				clear_buffer_new(bh);
> +				mark_buffer_dirty(bh);
> +			}
>  		}
> -next_bh:
> +
>  		block_start = block_end;
>  		bh = bh->b_this_page;
>  	} while (bh != head);
> -	return err;
>  }
>  
>  static int __block_commit_write(struct inode *inode, struct page *page,
> @@ -1899,6 +1913,7 @@ static int __block_commit_write(struct i
>  			set_buffer_uptodate(bh);
>  			mark_buffer_dirty(bh);
>  		}
> +		clear_buffer_new(bh);
>  	}
>  
>  	/*
> @@ -1912,6 +1927,93 @@ static int __block_commit_write(struct i
>  	return 0;
>  }
>  
> +int block_write_begin(struct file *file, struct address_space *mapping,
> +			loff_t pos, unsigned len, int intr,
> +			struct page **pagep, void **fsdata,
> +			get_block_t *get_block)
> +{
> +	struct inode *inode = mapping->host;
> +	int status = 0;
> +	struct page *page;
> +	pgoff_t index;
> +	unsigned start, end;
> +
> +	index = pos >> PAGE_CACHE_SHIFT;
> +	start = pos & (PAGE_CACHE_SIZE - 1);
> +	end = start + len;
> +
> +	page = *pagep;
> +	if (page == NULL) {
> +		page = __grab_cache_page(mapping, index);
> +		if (!page) {
> +			status = -ENOMEM;
> +			goto out;
> +		}
> +		*pagep = page;
> +	}
> +
> +	status = __block_prepare_write(inode, page, start, end, get_block);
> +	if (unlikely(status)) {
> +		ClearPageUptodate(page);
> +
> +		unlock_page(page);
> +		page_cache_release(page);
> +
> +		/*
> +		 * prepare_write() may have instantiated a few blocks
> +		 * outside i_size.  Trim these off again. Don't need
> +		 * i_size_read because we hold i_mutex.
> +		 */
> +		if (pos + len > inode->i_size)
> +			vmtruncate(inode, inode->i_size);
> +		goto out;
> +	}
> +
> +out:
> +	return status;
> +}
> +
> +int block_write_end(struct file *file, struct address_space *mapping,
> +			loff_t pos, unsigned len, unsigned copied,
> +			struct page *page, void *fsdata)
> +{
> +	struct inode *inode = mapping->host;
> +	unsigned start;
> +
> +	start = pos & (PAGE_CACHE_SIZE - 1);
> +
> +	if (unlikely(copied < len))
> +		page_zero_new_buffers(page, start+copied, start+len);
> +	flush_dcache_page(page);
> +
> +	/* This could be a short (even 0-length) commit */
> +	__block_commit_write(inode, page, start, start+copied);
> +
> +	/*
> +	 * The buffers that were written will now be uptodate, so we don't
> +	 * have to worry about a readpage reading them again.
> +	 * XXX: however I'm not sure about partial writes to buffers. Must
> +	 * recheck that, and hopefully we can remove the below test.
> +	 */
> +	if (!PageUptodate(page))
> +		copied = 0;
> +
> +	unlock_page(page);
> +	mark_page_accessed(page); /* XXX: put this in caller? */
> +	page_cache_release(page);
> +
> +	/*
> +	 * No need to use i_size_read() here, the i_size
> +	 * cannot change under us because we hold i_mutex.
> +	 */
> +	if (pos+copied > inode->i_size) {
> +		i_size_write(inode, pos+copied);
> +		mark_inode_dirty(inode);
> +	}
> +
> +	return copied;
> +}
> +
>  /*
>   * Generic "read page" function for block devices that have the normal
>   * get_block functionality. This is most of the block device filesystems.
> Index: linux-2.6/include/linux/buffer_head.h
> ===================================================================
> --- linux-2.6.orig/include/linux/buffer_head.h
> +++ linux-2.6/include/linux/buffer_head.h
> @@ -202,6 +202,13 @@ void block_invalidatepage(struct page *p
>  int block_write_full_page(struct page *page, get_block_t *get_block,
>  				struct writeback_control *wbc);
>  int block_read_full_page(struct page*, get_block_t*);
> +int block_write_begin(struct file *, struct address_space *,
> +				loff_t, unsigned, int,
> +				struct page **, void **, get_block_t*);
> +int block_write_end(struct file *, struct address_space *,
> +				loff_t, unsigned, unsigned,
> +				struct page *, void *);
> +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
>  int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
>  int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
>  				loff_t *);
> Index: linux-2.6/include/linux/pagemap.h
> ===================================================================
> --- linux-2.6.orig/include/linux/pagemap.h
> +++ linux-2.6/include/linux/pagemap.h
> @@ -85,6 +85,8 @@ unsigned find_get_pages_contig(struct ad
>  unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
>  			int tag, unsigned int nr_pages, struct page **pages);
>  
> +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
> +
>  /*
>   * Returns locked page at given index in given cache, creating it if needed.
>   */
> Index: linux-2.6/fs/libfs.c
> ===================================================================
> --- linux-2.6.orig/fs/libfs.c
> +++ linux-2.6/fs/libfs.c
> @@ -342,6 +342,24 @@ int simple_prepare_write(struct file *fi
>  	return 0;
>  }
>  
> +int simple_write_begin(struct file *file, struct address_space *mapping,
> +			loff_t pos, unsigned len, int intr,
> +			struct page **pagep, void **fsdata)
> +{
> +	struct page *page;
> +	pgoff_t index;
> +	unsigned from;
> +
> +	index = pos >> PAGE_CACHE_SHIFT;
> +	from = pos & (PAGE_CACHE_SIZE - 1);
> +
> +	page = __grab_cache_page(mapping, index);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	return simple_prepare_write(file, page, from, from+len);
> +}
> +
>  int simple_commit_write(struct file *file, struct page *page,
>  			unsigned from, unsigned to)
>  {
> @@ -360,6 +378,28 @@ int simple_commit_write(struct file *fil
>  	return 0;
>  }
>  
> +int simple_write_end(struct file *file, struct address_space *mapping,
> +			loff_t pos, unsigned len, unsigned copied,
> +			struct page *page, void *fsdata)
> +{
> +	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
> +
> +	/* zero the stale part of the page if we did a short copy */
> +	if (copied < len) {
> +		void *kaddr = kmap_atomic(page, KM_USER0);
> +		memset(kaddr + from + copied, 0, len - copied);
> +		flush_dcache_page(page);
> +		kunmap_atomic(kaddr, KM_USER0);
> +	}
> +
> +	simple_commit_write(file, page, from, from+copied);
> +
> +	unlock_page(page);
> +	page_cache_release(page);
> +
> +	return copied;
> +}
> +
>  int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files)
>  {
>  	struct inode *inode;
> Index: linux-2.6/drivers/block/loop.c
> ===================================================================
> --- linux-2.6.orig/drivers/block/loop.c
> +++ linux-2.6/drivers/block/loop.c
> @@ -206,11 +206,10 @@ lo_do_transfer(struct loop_device *lo, i
>   * space operations prepare_write and commit_write.
>   */
>  static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
> -		int bsize, loff_t pos, struct page *page)
> +		int bsize, loff_t pos, struct page *unused)
>  {
>  	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
>  	struct address_space *mapping = file->f_mapping;
> -	const struct address_space_operations *aops = mapping->a_ops;
>  	pgoff_t index;
>  	unsigned offset, bv_offs;
>  	int len, ret;
> @@ -222,67 +221,47 @@ static int do_lo_send_aops(struct loop_d
>  	len = bvec->bv_len;
>  	while (len > 0) {
>  		sector_t IV;
> -		unsigned size;
> +		unsigned size, copied;
>  		int transfer_result;
> +		struct page *page;
> +		void *fsdata;
>  
>  		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
>  		size = PAGE_CACHE_SIZE - offset;
>  		if (size > len)
>  			size = len;
> -		page = grab_cache_page(mapping, index);
> -		if (unlikely(!page))
> +
> +		ret = pagecache_write_begin(file, mapping, pos, size, 1,
> +							&page, &fsdata);
> +		if (ret)
>  			goto fail;
> -		ret = aops->prepare_write(file, page, offset,
> -					  offset + size);
> -		if (unlikely(ret)) {
> -			if (ret == AOP_TRUNCATED_PAGE) {
> -				page_cache_release(page);
> -				continue;
> -			}
> -			goto unlock;
> -		}
> +
>  		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
>  				bvec->bv_page, bv_offs, size, IV);
> -		if (unlikely(transfer_result)) {
> -			char *kaddr;
> +		copied = size;
> +		if (unlikely(transfer_result))
> +			copied = 0;
> +
> +		ret = pagecache_write_end(file, mapping, pos, size, copied,
> +							page, fsdata);
> +		if (ret < 0)
> +			goto fail;
> +		if (ret < copied)
> +			copied = ret;
>  
> -			/*
> -			 * The transfer failed, but we still write the data to
> -			 * keep prepare/commit calls balanced.
> -			 */
> -			printk(KERN_ERR "loop: transfer error block %llu\n",
> -			       (unsigned long long)index);
> -			kaddr = kmap_atomic(page, KM_USER0);
> -			memset(kaddr + offset, 0, size);
> -			kunmap_atomic(kaddr, KM_USER0);
> -		}
> -		flush_dcache_page(page);
> -		ret = aops->commit_write(file, page, offset,
> -					 offset + size);
> -		if (unlikely(ret)) {
> -			if (ret == AOP_TRUNCATED_PAGE) {
> -				page_cache_release(page);
> -				continue;
> -			}
> -			goto unlock;
<<<<<< BTW: do_lo_send_aops() code itself is realy ugly, for example patrial 
             write not supported. 
> -		}
>  		if (unlikely(transfer_result))
> -			goto unlock;
> -		bv_offs += size;
> -		len -= size;
> +			goto fail;
> +
> +		bv_offs += copied;
> +		len -= copied;
>  		offset = 0;
>  		index++;
> -		pos += size;
> -		unlock_page(page);
> -		page_cache_release(page);
> +		pos += copied;
>  	}
>  	ret = 0;
>  out:
>  	mutex_unlock(&mapping->host->i_mutex);
>  	return ret;
> -unlock:
> -	unlock_page(page);
> -	page_cache_release(page);
>  fail:
>  	ret = -1;
>  	goto out;
> Index: linux-2.6/fs/namei.c
> ===================================================================
> --- linux-2.6.orig/fs/namei.c
> +++ linux-2.6/fs/namei.c
> @@ -2688,53 +2688,30 @@ int __page_symlink(struct inode *inode, 
>  {
>  	struct address_space *mapping = inode->i_mapping;
>  	struct page *page;
> +	void *fsdata;
>  	int err;
>  	char *kaddr;
>  
>  retry:
> -	err = -ENOMEM;
> -	page = find_or_create_page(mapping, 0, gfp_mask);
> -	if (!page)
> -		goto fail;
> -	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
> -	if (err == AOP_TRUNCATED_PAGE) {
> -		page_cache_release(page);
> -		goto retry;
> -	}
> +	err = pagecache_write_begin(NULL, mapping, 0, PAGE_CACHE_SIZE, 0,
> +							&page, &fsdata);
>  	if (err)
> -		goto fail_map;
> +		goto fail;
> +
>  	kaddr = kmap_atomic(page, KM_USER0);
>  	memcpy(kaddr, symname, len-1);
> +	memset(kaddr+len-1, 0, PAGE_CACHE_SIZE-(len-1));
>  	kunmap_atomic(kaddr, KM_USER0);
> -	err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
> -	if (err == AOP_TRUNCATED_PAGE) {
> -		page_cache_release(page);
> -		goto retry;
> -	}
> -	if (err)
> -		goto fail_map;
> -	/*
> -	 * Notice that we are _not_ going to block here - end of page is
> -	 * unmapped, so this will only try to map the rest of page, see
> -	 * that it is unmapped (typically even will not look into inode -
> -	 * ->i_size will be enough for everything) and zero it out.
> -	 * OTOH it's obviously correct and should make the page up-to-date.
> -	 */
> -	if (!PageUptodate(page)) {
> -		err = mapping->a_ops->readpage(NULL, page);
> -		if (err != AOP_TRUNCATED_PAGE)
> -			wait_on_page_locked(page);
> -	} else {
> -		unlock_page(page);
> -	}
> -	page_cache_release(page);
> +
> +	err = pagecache_write_end(NULL, mapping, 0, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
> +							page, fsdata);
>  	if (err < 0)
>  		goto fail;
> +	if (err < PAGE_CACHE_SIZE)
> +		goto retry;
> +
>  	mark_inode_dirty(inode);
>  	return 0;
> -fail_map:
> -	unlock_page(page);
> -	page_cache_release(page);
>  fail:
>  	return err;
>  }
> Index: linux-2.6/fs/splice.c
> ===================================================================
> --- linux-2.6.orig/fs/splice.c
> +++ linux-2.6/fs/splice.c
> @@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod
>  	struct address_space *mapping = file->f_mapping;
>  	unsigned int offset, this_len;
>  	struct page *page;
> -	pgoff_t index;
> +	void *fsdata;
>  	int ret;
>  
>  	/*
> @@ -569,13 +569,13 @@ static int pipe_to_file(struct pipe_inod
>  	if (unlikely(ret))
>  		return ret;
>  
> -	index = sd->pos >> PAGE_CACHE_SHIFT;
>  	offset = sd->pos & ~PAGE_CACHE_MASK;
>  
>  	this_len = sd->len;
>  	if (this_len + offset > PAGE_CACHE_SIZE)
>  		this_len = PAGE_CACHE_SIZE - offset;
>  
> +#if 0
>  	/*
>  	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
>  	 * page.
> @@ -587,86 +587,11 @@ static int pipe_to_file(struct pipe_inod
>  		 * locked on successful return.
>  		 */
>  		if (buf->ops->steal(pipe, buf))
> -			goto find_page;
> +#endif
>  
> -		page = buf->page;
> -		if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
> -			unlock_page(page);
> -			goto find_page;
> -		}
> -
> -		page_cache_get(page);
> -
> -		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
> -			lru_cache_add(page);
> -	} else {
> -find_page:
> -		page = find_lock_page(mapping, index);
> -		if (!page) {
> -			ret = -ENOMEM;
> -			page = page_cache_alloc_cold(mapping);
> -			if (unlikely(!page))
> -				goto out_ret;
> -
> -			/*
> -			 * This will also lock the page
> -			 */
> -			ret = add_to_page_cache_lru(page, mapping, index,
> -						    GFP_KERNEL);
> -			if (unlikely(ret))
> -				goto out;
> -		}
> -
> -		/*
> -		 * We get here with the page locked. If the page is also
> -		 * uptodate, we don't need to do more. If it isn't, we
> -		 * may need to bring it in if we are not going to overwrite
> -		 * the full page.
> -		 */
> -		if (!PageUptodate(page)) {
> -			if (this_len < PAGE_CACHE_SIZE) {
> -				ret = mapping->a_ops->readpage(file, page);
> -				if (unlikely(ret))
> -					goto out;
> -
> -				lock_page(page);
> -
> -				if (!PageUptodate(page)) {
> -					/*
> -					 * Page got invalidated, repeat.
> -					 */
> -					if (!page->mapping) {
> -						unlock_page(page);
> -						page_cache_release(page);
> -						goto find_page;
> -					}
> -					ret = -EIO;
> -					goto out;
> -				}
> -			} else
> -				SetPageUptodate(page);
> -		}
> -	}
> -
> -	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
> -	if (unlikely(ret)) {
> -		loff_t isize = i_size_read(mapping->host);
> -
> -		if (ret != AOP_TRUNCATED_PAGE)
> -			unlock_page(page);
> -		page_cache_release(page);
> -		if (ret == AOP_TRUNCATED_PAGE)
> -			goto find_page;
> -
> -		/*
> -		 * prepare_write() may have instantiated a few blocks
> -		 * outside i_size.  Trim these off again.
> -		 */
> -		if (sd->pos + this_len > isize)
> -			vmtruncate(mapping->host, isize);
> -
> -		goto out_ret;
> -	}
> +	ret = pagecache_write_begin(file, mapping, sd->pos, sd->len, 0, &page, &fsdata);
> +	if (unlikely(ret))
> +		goto out;
>  
>  	if (buf->page != page) {
>  		/*
> @@ -676,28 +601,13 @@ find_page:
>  		char *dst = kmap_atomic(page, KM_USER1);
>  
>  		memcpy(dst + offset, src + buf->offset, this_len);
> -		flush_dcache_page(page);
>  		kunmap_atomic(dst, KM_USER1);
>  		buf->ops->unmap(pipe, buf, src);
>  	}
>  
> -	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
> -	if (!ret) {
> -		/*
> -		 * Return the number of bytes written and mark page as
> -		 * accessed, we are now done!
> -		 */
> -		ret = this_len;
> -		mark_page_accessed(page);
> -		balance_dirty_pages_ratelimited(mapping);
> -	} else if (ret == AOP_TRUNCATED_PAGE) {
> -		page_cache_release(page);
> -		goto find_page;
> -	}
> +	ret = pagecache_write_end(file, mapping, sd->pos, sd->len, sd->len, page, fsdata);
> +
>  out:
> -	page_cache_release(page);
> -	unlock_page(page);
> -out_ret:
>  	return ret;
>  }
>  
> Index: linux-2.6/Documentation/filesystems/Locking
> ===================================================================
> --- linux-2.6.orig/Documentation/filesystems/Locking
> +++ linux-2.6/Documentation/filesystems/Locking
> @@ -176,15 +176,18 @@ prototypes:
>  locking rules:
>  	All except set_page_dirty may block
>  
> -			BKL	PageLocked(page)
> +			BKL	PageLocked(page)	i_sem
>  writepage:		no	yes, unlocks (see below)
>  readpage:		no	yes, unlocks
>  sync_page:		no	maybe
>  writepages:		no
>  set_page_dirty		no	no
>  readpages:		no
> -prepare_write:		no	yes
> -commit_write:		no	yes
> +prepare_write:		no	yes			yes
> +commit_write:		no	yes			yes
> +write_begin:		no	locks the page		yes
> +write_end:		no	yes, unlocks		yes
Ohhh and now i'm totaly shure what you just forgot unlocking stuff
in pagecache_write_end()
> +perform_write:		no	n/a			yes
>  bmap:			yes
>  invalidatepage:		no	yes
>  releasepage:		no	yes
> Index: linux-2.6/Documentation/filesystems/vfs.txt
> ===================================================================
> --- linux-2.6.orig/Documentation/filesystems/vfs.txt
> +++ linux-2.6/Documentation/filesystems/vfs.txt
> @@ -534,6 +534,14 @@ struct address_space_operations {
>  			struct list_head *pages, unsigned nr_pages);
>  	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
>  	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> +	int (*write_begin)(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, int intr,
> +				struct page **pagep, void **fsdata);
> +	int (*write_end)(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, unsigned copied,
> +				struct page *page, void *fsdata);
> +	int (*perform_write)(struct file *, struct address_space *,
> +				struct iov_iter *, loff_t);
>  	sector_t (*bmap)(struct address_space *, sector_t);
>  	int (*invalidatepage) (struct page *, unsigned long);
>  	int (*releasepage) (struct page *, int);
> @@ -629,6 +637,36 @@ struct address_space_operations {
>          operations.  It should avoid returning an error if possible -
>          errors should have been handled by prepare_write.
>  
> +  write_begin: This is intended as a replacement for prepare_write. Called
> +        by the generic buffered write code to ask the filesystem to prepare
> +        to write len bytes at the given offset in the file. intr is a boolean
> +        that specifies whether or not the filesystem must be prepared to
> +        handle a short write (ie. write_end called with less than len bytes
> +        written).
> +
> +        The filesystem must return the locked pagecache page for the caller
> +        to write into.
> +
> +        A void * may be returned in fsdata, which then gets passed into
> +        write_end.
> +
> +        Returns < 0 on failure, in which case all cleanup must be done and
> +        write_end not called. 0 on success, in which case write_end must
> +        be called.
> +
> +  write_end: After a successful write_begin, and data copy, write_end must
> +        be called. len is the original len passed to write_begin, and copied
> +        is the amount that was able to be copied (they must be equal if
> +        write_begin was called with intr == 0).
> +
> +        The filesystem must take care of unlocking the page and dropping its
> +        refcount, and updating i_size.
> +
> +        Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
> +        that were able to be copied into the file.
> +
> +  perform_write:
> +
>    bmap: called by the VFS to map a logical block offset within object to
>    	physical block number. This method is used by the FIBMAP
>    	ioctl and for working with swap-files.  To be able to swap to
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-14 21:28   ` Dmitriy Monakhov
@ 2007-03-15  3:55     ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-15  3:55 UTC (permalink / raw)
  To: Dmitriy Monakhov
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton,
	Mark Fasheh

On Thu, Mar 15, 2007 at 12:28:04AM +0300, Dmitriy Monakhov wrote:
> Nick Piggin <npiggin@suse.de> writes:
> 
> > +
> > +int pagecache_write_end(struct file *file, struct address_space *mapping,
> > +				loff_t pos, unsigned len, unsigned copied,
> > +				struct page *page, void *fsdata)
> > +{
> > +	const struct address_space_operations *aops = mapping->a_ops;
> > +	int ret;
> > +
> > +	if (aops->write_begin)
> > +		ret = aops->write_end(file, mapping, pos, len, copied, page, fsdata);
> > +	else {
> > +		int ret;
> > +		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
> > +		struct inode *inode = mapping->host;
> > +
> > +		flush_dcache_page(page);
> > +		ret = aops->commit_write(file, page, offset, offset+len);
> > +		if (ret < 0) {
> > +			unlock_page(page);
> > +			page_cache_release(page);
> > +			if (pos + len > inode->i_size)
> > +				vmtruncate(inode, inode->i_size);
> > +		} else
> > +			ret = copied;
> What about AOP_TRUNCATED_PAGE?  Off corse we can't just "goto retry" here :) ,
> but we may return it to caller and let's caller handle it.  

Yeah AOP_TRUNCATED_PAGE... I'm _hoping_ that OCFS2 and GFS2 will be able
to avoid that using write_begin/write_end, so the caller will not have to
know anything about it.

I don't know that commit_write can even return AOP_TRUNCATED_PAGE... we
should have gathered all our locks in prepare_write.


> > +	}
> > +
> > +	return copied;
> if ->commit_write return non negative value we return with sill locked page  look above at [1] 
> may be it will be unlocked by caller? I guess no it was just forgoten.

Yeah, thanks. I think I converted all my filesystems to use write_begin /
write_end, so I probably didn't test this path :P. I do plan to go through
and try to individually test error cases and stress test it over the next
couple of days.


> > +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
> > +{
> > +	unsigned int block_start, block_end;
> > +	struct buffer_head *head, *bh;
> > +
> > +	BUG_ON(!PageLocked(page));
> > +	if (!page_has_buffers(page))
> > +		return;__block_prepare_write 
> > +
> > +	bh = head = page_buffers(page);
> >  	block_start = 0;
> >  	do {
> > -		block_end = block_start+blocksize;
> > -		if (block_end <= from)
> > -			goto next_bh;
> > -		if (block_start >= to)
> > -			break;
> > +		block_end = block_start + bh->b_size;
> > +
> >  		if (buffer_new(bh)) {
> > -			void *kaddr;
> > +			if (block_end > from && block_start < to) {
> > +				if (!PageUptodate(page)) {
> > +					unsigned start, end;
> > +					void *kaddr;
> > +
> > +					start = max(from, block_start);
> > +					end = min(to, block_end);
> > +
> > +					kaddr = kmap_atomic(page, KM_USER0);
> > +					memset(kaddr+start, 0, block_end-end);
> <<<<OOPS why you new block wasn't fully zeroed? as it was done before.
> At least this result in information leak in case of (stat == from)
> just imagine fs with blocksize == 1k conains file with i_size == 4096 and 
> fist two blocks not mapped (hole), now invoke write op from 1023 to 2048.
> For example we succeed in allocating first block, but faile while allocating second
> , then we call page_zero_new_buffers(...from == 1023, to == 2048)
>   and then zerro only one last byte for first block, and set is uptodate
> After this we just do read( from == 0, to == 1023) and steal old block content.

When we first invoke the write op, it should see were doing a partial
write into the first buffer and bring it uptodate first. I don't see the
problem, but again I do need to go through and exercise various cases
like this.


> > @@ -222,67 +221,47 @@ static int do_lo_send_aops(struct loop_d
> >  	len = bvec->bv_len;
> >  	while (len > 0) {
> >  		sector_t IV;
> > -		unsigned size;
> > +		unsigned size, copied;
> >  		int transfer_result;
> > +		struct page *page;
> > +		void *fsdata;
> >  
> >  		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
> >  		size = PAGE_CACHE_SIZE - offset;
> >  		if (size > len)
> >  			size = len;
> > -		page = grab_cache_page(mapping, index);
> > -		if (unlikely(!page))
> > +
> > +		ret = pagecache_write_begin(file, mapping, pos, size, 1,
> > +							&page, &fsdata);
> > +		if (ret)
> >  			goto fail;
> > -		ret = aops->prepare_write(file, page, offset,
> > -					  offset + size);
> > -		if (unlikely(ret)) {
> > -			if (ret == AOP_TRUNCATED_PAGE) {
> > -				page_cache_release(page);
> > -				continue;
> > -			}
> > -			goto unlock;
> > -		}
> > +
> >  		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
> >  				bvec->bv_page, bv_offs, size, IV);
> > -		if (unlikely(transfer_result)) {
> > -			char *kaddr;
> > +		copied = size;
> > +		if (unlikely(transfer_result))
> > +			copied = 0;
> > +
> > +		ret = pagecache_write_end(file, mapping, pos, size, copied,
> > +							page, fsdata);
> > +		if (ret < 0)
> > +			goto fail;
> > +		if (ret < copied)
> > +			copied = ret;
> >  
> > -			/*
> > -			 * The transfer failed, but we still write the data to
> > -			 * keep prepare/commit calls balanced.
> > -			 */
> > -			printk(KERN_ERR "loop: transfer error block %llu\n",
> > -			       (unsigned long long)index);
> > -			kaddr = kmap_atomic(page, KM_USER0);
> > -			memset(kaddr + offset, 0, size);
> > -			kunmap_atomic(kaddr, KM_USER0);
> > -		}
> > -		flush_dcache_page(page);
> > -		ret = aops->commit_write(file, page, offset,
> > -					 offset + size);
> > -		if (unlikely(ret)) {
> > -			if (ret == AOP_TRUNCATED_PAGE) {
> > -				page_cache_release(page);
> > -				continue;
> > -			}
> > -			goto unlock;
> <<<<<< BTW: do_lo_send_aops() code itself is realy ugly, for example patrial 
>              write not supported. 

Well, the new code just works. The old code tried to support partial writes,
but it ends up just zeroing out the rest of the unwritten part of the page,
so it can actually overwrite real data.

This is one of the problems with prepare_write/commit_write: error handling
is fairly complex, and leaving it up to the callers is just insane because
we can't even get it right in mm/ :P


Anyway, thanks for having a look over it. You definitely spotted a bug
with the page locking.

Nick

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
       [not found]   ` <200703142246.27167.m.kozlowski@tuxland.pl>
@ 2007-03-15  3:58     ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-15  3:58 UTC (permalink / raw)
  To: Mariusz Kozlowski
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton,
	Mark Fasheh

On Wed, Mar 14, 2007 at 10:46:25PM +0100, Mariusz Kozlowski wrote:
> Hello, 
> 
> 	I guess no need to define 'ret' twice here.

[...]

Hi Mariusz,

Thanks, I'll clean that up.

Nick

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
  2007-03-14 21:28   ` Dmitriy Monakhov
       [not found]   ` <200703142246.27167.m.kozlowski@tuxland.pl>
@ 2007-03-15  4:13   ` Mark Fasheh
  2007-03-15  4:36     ` Nick Piggin
  2007-03-15 20:06     ` Trond Myklebust
  2007-03-15  9:44   ` Dmitriy Monakhov
  3 siblings, 2 replies; 19+ messages in thread
From: Mark Fasheh @ 2007-03-15  4:13 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton

Hi Nick,

On Wed, Mar 14, 2007 at 02:38:22PM +0100, Nick Piggin wrote:
> Introduce write_begin, write_end, and perform_write aops.
> 
> These are intended to replace prepare_write and commit_write with more
> flexible alternatives that are also able to avoid the buffered write
> deadlock problems efficiently (which prepare_write is unable to do).

> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h
> +++ linux-2.6/include/linux/fs.h
> @@ -449,6 +449,17 @@ struct address_space_operations {
>  	 */
>  	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
>  	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> +
> +	int (*write_begin)(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, int intr,
> +				struct page **pagep, void **fsdata);
> +	int (*write_end)(struct file *, struct address_space *mapping,
> +				loff_t pos, unsigned len, unsigned copied,
> +				struct page *page, void *fsdata);

Are we going to get rid of the file and intr arguments btw? I'm not sure
intr is useful, and mapping is probably enough to get whatever we inside
->write_begin / ->write_end.

Also, I noticed that you didn't export block_write_begin(),
simple_write_begin(), block_write_end() and simple_write_end() - I think we
want those for client modules.


Attached is a quick patch to hook up the existing ocfs2 write code. This has
been compile tested only for now - one of my test machines isn't
cooperating, so a runtime test will have to wait until tommorrow.

One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a
level. This gives callers less to deal with. And it means that ocfs2 doesn't
have to use the ocfs2_*_lock_with_page() cluster lock variants in
ocfs2_block_write_begin() because it can order cluster locks outside of the
page lock there.

My ocfs2 write rework will be a more serious user of these stuff, including
the fsdata backpointer. That code will also eliminate the entire
ocfs2_*_lock_with_page() cluster locking workarounds for write (they'll have
to remain for ->readpage()). I'm beginning work on cleaning those ocfs2
patches up and getting them plugged into this stuff. I hope to post them in
the next day or two.
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

ocfs2: Convert to new aops

Turn ocfs2_prepare_write() and ocfs2_commit_write() into ocfs2_write_begin()
and ocfs2_write_end().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 93628b0..e7bcbbd 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -293,29 +293,30 @@ int ocfs2_prepare_write_nolock(struct in
 }
 
 /*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback.  It must be able to perform its own locking around
- * ocfs2_get_block().
+ * ocfs2_write_begin() can be an outer-most ocfs2 call when it is
+ * called from elsewhere in the kernel. It must be able to perform its
+ * own locking around ocfs2_get_block().
  */
-static int ocfs2_prepare_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
+static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+			     loff_t pos, unsigned len, int intr,
+			     struct page **pagep, void **fsdata)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = mapping->host;
 	int ret;
 
-	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
+	ret = ocfs2_meta_lock(inode, NULL, 0);
 	if (ret != 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = block_write_begin(file, mapping, pos, len, intr, pagep, fsdata,
+				ocfs2_get_block);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
 
 	ocfs2_meta_unlock(inode, 0);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -388,16 +389,21 @@ out:
 	return handle;
 }
 
-static int ocfs2_commit_write(struct file *file, struct page *page,
-			      unsigned from, unsigned to)
+static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
 {
 	int ret;
+	unsigned from, to;
 	struct buffer_head *di_bh = NULL;
 	struct inode *inode = page->mapping->host;
 	handle_t *handle = NULL;
 	struct ocfs2_dinode *di;
 
-	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
+	mlog_entry("(0x%p, 0x%p)\n", file, page);
+
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
 
 	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
 	 * us to continue here without rechecking the I/O against
@@ -441,8 +447,10 @@ static int ocfs2_commit_write(struct fil
 	}
 
 	/* might update i_size */
-	ret = generic_commit_write(file, page, from, to);
-	if (ret < 0) {
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+	if (copied < 0) {
+		ret = copied;
+		copied = 0;
 		mlog_errno(ret);
 		goto out_commit;
 	}
@@ -458,11 +466,10 @@ static int ocfs2_commit_write(struct fil
 	di->i_size = cpu_to_le64((u64)i_size_read(inode));
 
 	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0) {
+	if (ret < 0)
 		mlog_errno(ret);
-		goto out_commit;
-	}
 
+	ret = 0;
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out_unlock_data:
@@ -470,11 +477,29 @@ out_unlock_data:
 out_unlock_meta:
 	ocfs2_meta_unlock(inode, 1);
 out:
+	if (ret && ret != AOP_TRUNCATED_PAGE) {
+		/*
+		 * We caught an error before block_write_end() -
+		 * unlock and free the page.
+		 */
+		unlock_page(page);
+		page_cache_release(page);
+	} else if (ret == AOP_TRUNCATED_PAGE) {
+		/*
+		 * The cluster locking code avoided a deadlock by
+		 * unlocking the page - simply release here and turn
+		 * the error into something negative for the caller to
+		 * key on.
+		 */
+		page_cache_release(page);
+		ret = -EAGAIN;
+	}
+
 	if (di_bh)
 		brelse(di_bh);
 
 	mlog_exit(ret);
-	return ret;
+	return copied ? copied : ret;
 }
 
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
@@ -657,8 +682,8 @@ out:
 const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
-	.prepare_write	= ocfs2_prepare_write,
-	.commit_write	= ocfs2_commit_write,
+	.write_begin	= ocfs2_write_begin,
+	.write_end	= ocfs2_write_end,
 	.bmap		= ocfs2_bmap,
 	.sync_page	= block_sync_page,
 	.direct_IO	= ocfs2_direct_IO

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  4:13   ` Mark Fasheh
@ 2007-03-15  4:36     ` Nick Piggin
  2007-03-15  6:11       ` Mark Fasheh
                         ` (2 more replies)
  2007-03-15 20:06     ` Trond Myklebust
  1 sibling, 3 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-15  4:36 UTC (permalink / raw)
  To: Mark Fasheh
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton

On Wed, Mar 14, 2007 at 09:13:29PM -0700, Mark Fasheh wrote:
> Hi Nick,
> 
> On Wed, Mar 14, 2007 at 02:38:22PM +0100, Nick Piggin wrote:
> > Introduce write_begin, write_end, and perform_write aops.
> > 
> > These are intended to replace prepare_write and commit_write with more
> > flexible alternatives that are also able to avoid the buffered write
> > deadlock problems efficiently (which prepare_write is unable to do).
> 
> > Index: linux-2.6/include/linux/fs.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/fs.h
> > +++ linux-2.6/include/linux/fs.h
> > @@ -449,6 +449,17 @@ struct address_space_operations {
> >  	 */
> >  	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
> >  	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> > +
> > +	int (*write_begin)(struct file *, struct address_space *mapping,
> > +				loff_t pos, unsigned len, int intr,
> > +				struct page **pagep, void **fsdata);
> > +	int (*write_end)(struct file *, struct address_space *mapping,
> > +				loff_t pos, unsigned len, unsigned copied,
> > +				struct page *page, void *fsdata);
> 
> Are we going to get rid of the file and intr arguments btw? I'm not sure
> intr is useful, and mapping is probably enough to get whatever we inside
> ->write_begin / ->write_end.

Yeah, I was going to, but I had this version ready to go so decided
to leave them in at the last minute. We can definitely take them out
if people agree.

However a side note about intr -- I wonder if it might be wise to
include a flags argument, in case we might want to add something like
that later? (definitely if we do keep intr, then it should be done as
a flag rather than its own int).


> Also, I noticed that you didn't export block_write_begin(),
> simple_write_begin(), block_write_end() and simple_write_end() - I think we
> want those for client modules.

Yep, simple oversight on my part.


> Attached is a quick patch to hook up the existing ocfs2 write code. This has
> been compile tested only for now - one of my test machines isn't
> cooperating, so a runtime test will have to wait until tommorrow.
> 
> One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a
> level. This gives callers less to deal with. And it means that ocfs2 doesn't
> have to use the ocfs2_*_lock_with_page() cluster lock variants in
> ocfs2_block_write_begin() because it can order cluster locks outside of the
> page lock there.

OK that's very cool. I was hoping that would be the case. If GFS2 can
avoid that too, then we might be able to get rid of AOP_TRUNCATE_PAGE
handling from the legacy prepare/commit_write paths, which will make
them simpler.

> My ocfs2 write rework will be a more serious user of these stuff, including
> the fsdata backpointer. That code will also eliminate the entire
> ocfs2_*_lock_with_page() cluster locking workarounds for write (they'll have
> to remain for ->readpage()). I'm beginning work on cleaning those ocfs2
> patches up and getting them plugged into this stuff. I hope to post them in
> the next day or two.

OK, well I'll add this to my queue for now, and post the full patchset
after incorporating feedback I've had so far, and doing more testing,
so people can actually apply them and boot kernels.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  4:36     ` Nick Piggin
@ 2007-03-15  6:11       ` Mark Fasheh
  2007-03-15  6:23       ` Joel Becker
  2007-03-15 16:24       ` Steven Whitehouse
  2 siblings, 0 replies; 19+ messages in thread
From: Mark Fasheh @ 2007-03-15  6:11 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton

On Thu, Mar 15, 2007 at 05:36:42AM +0100, Nick Piggin wrote:
> > Are we going to get rid of the file and intr arguments btw? I'm not sure
> > intr is useful, and mapping is probably enough to get whatever we inside
> > ->write_begin / ->write_end.
> 
> Yeah, I was going to, but I had this version ready to go so decided
> to leave them in at the last minute. We can definitely take them out
> if people agree.
> 
> However a side note about intr -- I wonder if it might be wise to
> include a flags argument, in case we might want to add something like
> that later? (definitely if we do keep intr, then it should be done as
> a flag rather than its own int).

I don't see a problem with having a flags argument. It could give us some
flexibility in the future which would otherwise require a much bigger
update. If we found out that we needed intr, it could just be a flag.


> > One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a
> > level. This gives callers less to deal with. And it means that ocfs2 doesn't
> > have to use the ocfs2_*_lock_with_page() cluster lock variants in
> > ocfs2_block_write_begin() because it can order cluster locks outside of the
> > page lock there.
> 
> OK that's very cool. I was hoping that would be the case. If GFS2 can
> avoid that too, then we might be able to get rid of AOP_TRUNCATE_PAGE
> handling from the legacy prepare/commit_write paths, which will make
> them simpler.

Yeah - so long as we're not taking a page fault between write_begin /
write_end, there's no reason for the cluster locks to be taken and dropped
within the individual callbacks, which means we can just take them in
write_begin (where page lock ordering is possible) and hold them until
write_end is called.


> OK, well I'll add this to my queue for now, and post the full patchset
> after incorporating feedback I've had so far, and doing more testing,
> so people can actually apply them and boot kernels.

Great, thanks - it just occured to me that I should be holding the clusters
locks across the entire copy (as I point out above), so I'll have a slightly
updated version of this patch for you soon :)
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  4:36     ` Nick Piggin
  2007-03-15  6:11       ` Mark Fasheh
@ 2007-03-15  6:23       ` Joel Becker
  2007-03-15  8:04         ` Nick Piggin
  2007-03-15 16:24       ` Steven Whitehouse
  2 siblings, 1 reply; 19+ messages in thread
From: Joel Becker @ 2007-03-15  6:23 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Mark Fasheh, Linux Filesystems, Linux Kernel, Christoph Hellwig,
	Andrew Morton

On Thu, Mar 15, 2007 at 05:36:42AM +0100, Nick Piggin wrote:
> On Wed, Mar 14, 2007 at 09:13:29PM -0700, Mark Fasheh wrote:
> > Are we going to get rid of the file and intr arguments btw? I'm not sure
> > intr is useful, and mapping is probably enough to get whatever we inside
> > ->write_begin / ->write_end.
> 
> Yeah, I was going to, but I had this version ready to go so decided
> to leave them in at the last minute. We can definitely take them out
> if people agree.

	You're really going to need the file argument around.  Some
folks care about file->private_data, etc.  A good example is
nfs_updatepage() from nfs_commit_write().  There's a context on the
filp.  Mapping can get back to the inode via ->host, but not to the
struct file.

Joel

-- 

Life's Little Instruction Book #157 

	"Take time to smell the roses."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker@oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  6:23       ` Joel Becker
@ 2007-03-15  8:04         ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-15  8:04 UTC (permalink / raw)
  To: Mark Fasheh, Linux Filesystems, Linux Kernel, Christoph Hellwig,
	Andrew Morton

On Wed, Mar 14, 2007 at 11:23:05PM -0700, Joel Becker wrote:
> On Thu, Mar 15, 2007 at 05:36:42AM +0100, Nick Piggin wrote:
> > On Wed, Mar 14, 2007 at 09:13:29PM -0700, Mark Fasheh wrote:
> > > Are we going to get rid of the file and intr arguments btw? I'm not sure
> > > intr is useful, and mapping is probably enough to get whatever we inside
> > > ->write_begin / ->write_end.
> > 
> > Yeah, I was going to, but I had this version ready to go so decided
> > to leave them in at the last minute. We can definitely take them out
> > if people agree.
> 
> 	You're really going to need the file argument around.  Some
> folks care about file->private_data, etc.  A good example is
> nfs_updatepage() from nfs_commit_write().  There's a context on the
> filp.  Mapping can get back to the inode via ->host, but not to the
> struct file.

OK, I'll keep the file around unless we see a better alternative.
Thanks for pointing that out.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
                     ` (2 preceding siblings ...)
  2007-03-15  4:13   ` Mark Fasheh
@ 2007-03-15  9:44   ` Dmitriy Monakhov
  2007-03-15 10:04     ` Nick Piggin
  3 siblings, 1 reply; 19+ messages in thread
From: Dmitriy Monakhov @ 2007-03-15  9:44 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Filesystems, Linux Kernel, Christoph Hellwig, Andrew Morton,
	Mark Fasheh

Nick Piggin <npiggin@suse.de> writes:

> Index: linux-2.6/fs/splice.c
> ===================================================================
> --- linux-2.6.orig/fs/splice.c
> +++ linux-2.6/fs/splice.c
> @@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod
>  	struct address_space *mapping = file->f_mapping;
>  	unsigned int offset, this_len;
>  	struct page *page;
> -	pgoff_t index;
> +	void *fsdata;
>  	int ret;
>  
>  	/*
> @@ -569,13 +569,13 @@ static int pipe_to_file(struct pipe_inod
>  	if (unlikely(ret))
>  		return ret;
>  
> -	index = sd->pos >> PAGE_CACHE_SHIFT;
>  	offset = sd->pos & ~PAGE_CACHE_MASK;
>  
>  	this_len = sd->len;
>  	if (this_len + offset > PAGE_CACHE_SIZE)
>  		this_len = PAGE_CACHE_SIZE - offset;
>  
> +#if 0
>  	/*
>  	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
>  	 * page.
> @@ -587,86 +587,11 @@ static int pipe_to_file(struct pipe_inod
>  		 * locked on successful return.
>  		 */
>  		if (buf->ops->steal(pipe, buf))
> -			goto find_page;
> +#endif
One more note. It's looks like you just disabled all fancy zero copy logic.
Off corse this is just rfc patchset.
But i think where is fundamental problem with it:
 Previous logic was following:
  1)splice code responsible for: stealing(if possible) and loking the page
  2)prepare_write() code responsible for: do fs speciffic stuff

 But with new write_begin() logic  all steps (grubbing, locking, preparing)
 happened internaly inside write_begin() witch doesn't even know about what
 kind of data will be copied between write_begin/write_end.
 So fancy zero copy logic is impossible :(
I think this can be solved somehow, but i dont know yet, how can this be done
without implementing it inside begin_write().

>  
> -		page = buf->page;
> -		if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) {
> -			unlock_page(page);
> -			goto find_page;
> -		}
> -
> -		page_cache_get(page);
> -
> -		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
> -			lru_cache_add(page);
> -	} else {
> -find_page:
> -		page = find_lock_page(mapping, index);
> -		if (!page) {
> -			ret = -ENOMEM;
> -			page = page_cache_alloc_cold(mapping);
> -			if (unlikely(!page))
> -				goto out_ret;
> -
> -			/*
> -			 * This will also lock the page
> -			 */
> -			ret = add_to_page_cache_lru(page, mapping, index,
> -						    GFP_KERNEL);
> -			if (unlikely(ret))
> -				goto out;
> -		}
> -
> -		/*
> -		 * We get here with the page locked. If the page is also
> -		 * uptodate, we don't need to do more. If it isn't, we
> -		 * may need to bring it in if we are not going to overwrite
> -		 * the full page.
> -		 */
> -		if (!PageUptodate(page)) {
> -			if (this_len < PAGE_CACHE_SIZE) {
> -				ret = mapping->a_ops->readpage(file, page);
> -				if (unlikely(ret))
> -					goto out;
> -
> -				lock_page(page);
> -
> -				if (!PageUptodate(page)) {
> -					/*
> -					 * Page got invalidated, repeat.
> -					 */
> -					if (!page->mapping) {
> -						unlock_page(page);
> -						page_cache_release(page);
> -						goto find_page;
> -					}
> -					ret = -EIO;
> -					goto out;
> -				}
> -			} else
> -				SetPageUptodate(page);
> -		}
> -	}
> -
> -	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
> -	if (unlikely(ret)) {
> -		loff_t isize = i_size_read(mapping->host);
> -
> -		if (ret != AOP_TRUNCATED_PAGE)
> -			unlock_page(page);
> -		page_cache_release(page);
> -		if (ret == AOP_TRUNCATED_PAGE)
> -			goto find_page;
> -
> -		/*
> -		 * prepare_write() may have instantiated a few blocks
> -		 * outside i_size.  Trim these off again.
> -		 */
> -		if (sd->pos + this_len > isize)
> -			vmtruncate(mapping->host, isize);
> -
> -		goto out_ret;
> -	}
> +	ret = pagecache_write_begin(file, mapping, sd->pos, sd->len, 0, &page, &fsdata);
> +	if (unlikely(ret))
> +		goto out;
>  
>  	if (buf->page != page) {
>  		/*
> @@ -676,28 +601,13 @@ find_page:
>  		char *dst = kmap_atomic(page, KM_USER1);
>  
>  		memcpy(dst + offset, src + buf->offset, this_len);
> -		flush_dcache_page(page);
>  		kunmap_atomic(dst, KM_USER1);
>  		buf->ops->unmap(pipe, buf, src);
>  	}
>  
> -	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
> -	if (!ret) {
> -		/*
> -		 * Return the number of bytes written and mark page as
> -		 * accessed, we are now done!
> -		 */
> -		ret = this_len;
> -		mark_page_accessed(page);
> -		balance_dirty_pages_ratelimited(mapping);
> -	} else if (ret == AOP_TRUNCATED_PAGE) {
> -		page_cache_release(page);
> -		goto find_page;
> -	}
> +	ret = pagecache_write_end(file, mapping, sd->pos, sd->len, sd->len, page, fsdata);
> +
>  out:
> -	page_cache_release(page);
> -	unlock_page(page);
> -out_ret:
>  	return ret;
>  }
>  


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  9:44   ` Dmitriy Monakhov
@ 2007-03-15 10:04     ` Nick Piggin
  0 siblings, 0 replies; 19+ messages in thread
From: Nick Piggin @ 2007-03-15 10:04 UTC (permalink / raw)
  To: Dmitriy Monakhov
  Cc: Nick Piggin, Linux Filesystems, Linux Kernel, Christoph Hellwig,
	Andrew Morton, Mark Fasheh

Dmitriy Monakhov wrote:
> Nick Piggin <npiggin@suse.de> writes:
> 
> 
>>Index: linux-2.6/fs/splice.c
>>===================================================================
>>--- linux-2.6.orig/fs/splice.c
>>+++ linux-2.6/fs/splice.c
>>@@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod
>> 	struct address_space *mapping = file->f_mapping;
>> 	unsigned int offset, this_len;
>> 	struct page *page;
>>-	pgoff_t index;
>>+	void *fsdata;
>> 	int ret;
>> 
>> 	/*
>>@@ -569,13 +569,13 @@ static int pipe_to_file(struct pipe_inod
>> 	if (unlikely(ret))
>> 		return ret;
>> 
>>-	index = sd->pos >> PAGE_CACHE_SHIFT;
>> 	offset = sd->pos & ~PAGE_CACHE_MASK;
>> 
>> 	this_len = sd->len;
>> 	if (this_len + offset > PAGE_CACHE_SIZE)
>> 		this_len = PAGE_CACHE_SIZE - offset;
>> 
>>+#if 0
>> 	/*
>> 	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
>> 	 * page.
>>@@ -587,86 +587,11 @@ static int pipe_to_file(struct pipe_inod
>> 		 * locked on successful return.
>> 		 */
>> 		if (buf->ops->steal(pipe, buf))
>>-			goto find_page;
>>+#endif
> 
> One more note. It's looks like you just disabled all fancy zero copy logic.
> Off corse this is just rfc patchset.
> But i think where is fundamental problem with it:
>  Previous logic was following:
>   1)splice code responsible for: stealing(if possible) and loking the page
>   2)prepare_write() code responsible for: do fs speciffic stuff
> 
>  But with new write_begin() logic  all steps (grubbing, locking, preparing)
>  happened internaly inside write_begin() witch doesn't even know about what
>  kind of data will be copied between write_begin/write_end.
>  So fancy zero copy logic is impossible :(

Check linux-mm: zero-copy splice is broken anyway, and AFAIKS it cannot
really be fixed to work with the current prepare_write.

> I think this can be solved somehow, but i dont know yet, how can this be done
> without implementing it inside begin_write().

Actually we could do it with begin_write. All we need to do is set a
flag to say that *pagep contains a page that we can use, with a copy of
the write data on it.

The filesystem would then be able to insert that page into the pagecache
*if* it could handle such an operation.

OTOH, is splice stealing support really important? I guess it could be a
nice win for a small niche of workloads, and we probably don't want to
exclude it by design...

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  4:36     ` Nick Piggin
  2007-03-15  6:11       ` Mark Fasheh
  2007-03-15  6:23       ` Joel Becker
@ 2007-03-15 16:24       ` Steven Whitehouse
  2 siblings, 0 replies; 19+ messages in thread
From: Steven Whitehouse @ 2007-03-15 16:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, Christoph Hellwig, Linux Kernel, Linux Filesystems,
	Mark Fasheh

Hi,

On Thu, 2007-03-15 at 05:36 +0100, Nick Piggin wrote:
> On Wed, Mar 14, 2007 at 09:13:29PM -0700, Mark Fasheh wrote:
[some comments snipped]
> > Attached is a quick patch to hook up the existing ocfs2 write code. This has
> > been compile tested only for now - one of my test machines isn't
> > cooperating, so a runtime test will have to wait until tommorrow.
> > 
> > One interesting side effect is that we no longer pass AOP_TRUNCATE_PAGE up a
> > level. This gives callers less to deal with. And it means that ocfs2 doesn't
> > have to use the ocfs2_*_lock_with_page() cluster lock variants in
> > ocfs2_block_write_begin() because it can order cluster locks outside of the
> > page lock there.
> 
> OK that's very cool. I was hoping that would be the case. If GFS2 can
> avoid that too, then we might be able to get rid of AOP_TRUNCATE_PAGE
> handling from the legacy prepare/commit_write paths, which will make
> them simpler.
> 
Yes, I agree that with the new operations GFS2 should also no longer
need AOP_TRUNCATE_PAGE for writes,

Steve.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15  4:13   ` Mark Fasheh
  2007-03-15  4:36     ` Nick Piggin
@ 2007-03-15 20:06     ` Trond Myklebust
  2007-03-15 20:44       ` Mark Fasheh
  1 sibling, 1 reply; 19+ messages in thread
From: Trond Myklebust @ 2007-03-15 20:06 UTC (permalink / raw)
  To: Mark Fasheh
  Cc: Nick Piggin, Linux Filesystems, Linux Kernel, Christoph Hellwig,
	Andrew Morton

On Wed, 2007-03-14 at 21:13 -0700, Mark Fasheh wrote:
> Hi Nick,
> 
> On Wed, Mar 14, 2007 at 02:38:22PM +0100, Nick Piggin wrote:
> > Introduce write_begin, write_end, and perform_write aops.
> > 
> > These are intended to replace prepare_write and commit_write with more
> > flexible alternatives that are also able to avoid the buffered write
> > deadlock problems efficiently (which prepare_write is unable to do).
> 
> > Index: linux-2.6/include/linux/fs.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/fs.h
> > +++ linux-2.6/include/linux/fs.h
> > @@ -449,6 +449,17 @@ struct address_space_operations {
> >  	 */
> >  	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
> >  	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> > +
> > +	int (*write_begin)(struct file *, struct address_space *mapping,
> > +				loff_t pos, unsigned len, int intr,
> > +				struct page **pagep, void **fsdata);
> > +	int (*write_end)(struct file *, struct address_space *mapping,
> > +				loff_t pos, unsigned len, unsigned copied,
> > +				struct page *page, void *fsdata);
> 
> Are we going to get rid of the file and intr arguments btw? I'm not sure
> intr is useful, and mapping is probably enough to get whatever we inside
> ->write_begin / ->write_end.

Hell no! Struct file carries information that is essential for those of
us that use strong authentication. It stays.

Trond


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [patch 2/5] fs: introduce new aops and infrastructure
  2007-03-15 20:06     ` Trond Myklebust
@ 2007-03-15 20:44       ` Mark Fasheh
  0 siblings, 0 replies; 19+ messages in thread
From: Mark Fasheh @ 2007-03-15 20:44 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Nick Piggin, Linux Filesystems, Linux Kernel, Christoph Hellwig,
	Andrew Morton

On Thu, Mar 15, 2007 at 04:06:39PM -0400, Trond Myklebust wrote:
> Hell no! Struct file carries information that is essential for those of
> us that use strong authentication. It stays.

Joel pointed this out yesterday for the nfs case. Not to worry, we'll keep
struct file around :)
	--Mark

--
Mark Fasheh
Senior Software Developer, Oracle
mark.fasheh@oracle.com

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2007-03-15 20:45 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-14 13:38 [patch 1/5] fs: add an iovec iterator Nick Piggin
2007-03-14 13:38 ` [patch 2/5] fs: introduce new aops and infrastructure Nick Piggin
2007-03-14 21:28   ` Dmitriy Monakhov
2007-03-15  3:55     ` Nick Piggin
     [not found]   ` <200703142246.27167.m.kozlowski@tuxland.pl>
2007-03-15  3:58     ` Nick Piggin
2007-03-15  4:13   ` Mark Fasheh
2007-03-15  4:36     ` Nick Piggin
2007-03-15  6:11       ` Mark Fasheh
2007-03-15  6:23       ` Joel Becker
2007-03-15  8:04         ` Nick Piggin
2007-03-15 16:24       ` Steven Whitehouse
2007-03-15 20:06     ` Trond Myklebust
2007-03-15 20:44       ` Mark Fasheh
2007-03-15  9:44   ` Dmitriy Monakhov
2007-03-15 10:04     ` Nick Piggin
2007-03-14 13:38 ` [patch 3/5] fs: convert some simple filesystems Nick Piggin
2007-03-14 13:38 ` [patch 4/5] ext2: convert to new aops Nick Piggin
2007-03-14 13:38 ` [patch 5/5] ext3: " Nick Piggin
2007-03-14 13:51 ` [patch 1/5] fs: add an iovec iterator Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).