* [PATCH 1/2] batch-write.patch
@ 2006-06-29 19:17 Hans Reiser
2006-06-29 21:11 ` Chase Venters
2006-06-30 1:50 ` Andrew Morton
0 siblings, 2 replies; 20+ messages in thread
From: Hans Reiser @ 2006-06-29 19:17 UTC (permalink / raw)
To: Andrew Morton; +Cc: LKML
[-- Attachment #1: Type: text/plain, Size: 2 bytes --]
[-- Attachment #2: batched-write.patch --]
[-- Type: text/x-patch, Size: 12164 bytes --]
This patch adds a method batch_write to struct address_space_operations.
A filesystem may want to implement this operation to improve write performance.
Generic implementation for the method is made by cut-n-paste of generic_file_buffered_write:
it writes one page using prepare_write and commit_write address space operations.
diff -puN mm/filemap.c~batched-write mm/filemap.c
diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-mm3/include/linux/fs.h~batched-write 2006-06-28 21:39:27.000000000 +0400
+++ linux-2.6.17-mm3-root/include/linux/fs.h 2006-06-28 21:39:27.000000000 +0400
@@ -346,6 +346,39 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+typedef struct write_descriptor write_descriptor_t;
+
+/*
+ * This is "write_actor" function type, used by write() to copy data from user
+ * space. There are two functions of this type: write_actor and
+ * write_iovec_actor. First is used when user data are in one segment, second
+ * is used in case of vectored write.
+ */
+typedef size_t (*write_actor_t)(struct page *, unsigned long, size_t,
+ const write_descriptor_t *);
+
+/**
+ * struct write_descriptor - set of write arguments
+ * @pos: offset from the start of the file to write to
+ * @count: number of bytes to write
+ * @cur_iov: current i/o vector
+ * @iov_off: offset within current i/o vector
+ * @buf: user data
+ * @actor: function to copy user data with
+ *
+ * This structure is to pass to batch_write address space operation all
+ * information which is needed to continue write.
+ */
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ const struct iovec *cur_iov;
+ size_t iov_off;
+ char __user *buf;
+ write_actor_t actor;
+};
+
+#include <linux/pagevec.h>
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -367,6 +400,8 @@ struct address_space_operations {
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ long (*batch_write)(struct file *, const write_descriptor_t *,
+ struct pagevec *, struct page **, size_t *);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-mm3/mm/filemap.c~batched-write 2006-06-28 21:39:27.000000000 +0400
+++ linux-2.6.17-mm3-root/mm/filemap.c 2006-06-28 22:03:59.000000000 +0400
@@ -2160,72 +2160,101 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/**
+ * write_actor - copy data from user buffer
+ * @page: the page to copy data to
+ * @offset: offset within the page
+ * @bytes: number of bytes to copy
+ * @desc: pointer to user buffer is obtained from here
+ *
+ * This is used to copy data from user buffer into @page in case the i/o vector
+ * has 1 segment. In case of write, in short.
+ */
+static size_t write_actor(struct page *page, unsigned long offset,
+ size_t bytes, const write_descriptor_t *desc)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
+ return filemap_copy_from_user(page, offset, desc->buf, bytes);
+}
- pagevec_init(&lru_pvec, 0);
+/**
+ * write_iovec_actor - copy data from i/o vector
+ * @page: the page to copy data to
+ * @offset: offset within the page
+ * @bytes: number of bytes to copy
+ * @desc: current iovec and offset in it are obtained from here
+ *
+ * This is used to copy data from user buffer into @page in case of i/o vector
+ * has more than one segment. In case of writev, in short.
+ */
+static size_t write_iovec_actor(struct page *page, unsigned long offset,
+ size_t bytes, const write_descriptor_t *desc)
+{
+ return filemap_copy_from_user_iovec(page, offset, desc->cur_iov,
+ desc->iov_off, bytes);
+}
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
+/**
+ * generic_batch_write - generic implementation of batched write
+ * @file: the file to write to
+ * @desc: set of write arguments
+ * @lru_pvec: multipage container to batch adding pages to LRU list
+ * @cached_page: allocated but not used on previous call
+ * @written: returned number of bytes successfully written
+ *
+ * This implementation of batch_write method writes not more than one page of
+ * file. It faults in user space, allocates page and calls prepare_write and
+ * commit_write address space operations. User data are copied by an actor
+ * which is set by caller depending on whether write or writev is on the way.
+ */
+static long generic_batch_write(struct file *file,
+ const write_descriptor_t *desc,
+ struct pagevec *lru_pvec,
+ struct page **cached_page, size_t *written)
+{
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ long status;
+
+ /* offset within page write is to start at */
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
+ /* index of page we are to write to */
+ index = desc->pos >> PAGE_CACHE_SHIFT;
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
+ /* number of bytes which can be written to the page */
+ bytes = PAGE_CACHE_SIZE - offset;
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ /* Limit the size of the copy to the caller's write size */
+ bytes = min(bytes, desc->count);
+ /*
+ * Limit the size of the copy to that of the current segment,
+ * because fault_in_pages_readable() doesn't know how to walk
+ * segments.
+ */
+ bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off);
+
+ while (1) {
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ fault_in_pages_readable(desc->buf, bytes);
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
+ page = __grab_cache_page(file->f_mapping, index, cached_page,
+ lru_pvec);
+ if (!page)
+ return -ENOMEM;
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
+ status = a_ops->prepare_write(file, page, offset,
+ offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);
if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2233,57 +2262,127 @@ generic_file_buffered_write(struct kiocb
if (status == AOP_TRUNCATED_PAGE)
continue;
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * prepare_write() may have instantiated a few
+ * blocks outside i_size. Trim these off
+ * again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ /*
+ * call write actor in order to copy user data to the
+ * page
+ */
+ *written = desc->actor(page, offset, bytes, desc);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
- if (likely(copied > 0)) {
- if (!status)
- status = copied;
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
- filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
+ break;
+ }
+ /*
+ * If commit_write returned error - write failed and we zero
+ * number of written bytes. If write_actor copied less than it
+ * was asked to we return -EFAULT and number of bytes
+ * actually written.
+ */
+ if (status)
+ *written = 0;
+ else if (*written != bytes)
+ status = -EFAULT;
+ return status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct page *cached_page = NULL;
+ struct pagevec lru_pvec;
+ write_descriptor_t desc;
+ size_t copied = 0;
+
+ pagevec_init(&lru_pvec, 0);
+
+ /*
+ * initialize write descriptor fields: position to write to
+ * and number of bytes to write
+ */
+ desc.pos = pos;
+ desc.count = count;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1)) {
+ desc.cur_iov = iov;
+ desc.iov_off = written;
+ desc.actor = write_actor;
+ } else {
+ filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written);
+ desc.actor = write_iovec_actor;
+ }
+ /* pointer to user buffer */
+ desc.buf = desc.cur_iov->iov_base + desc.iov_off;
+
+ do {
+ /*
+ * When calling the filesystem for writes, there is processing
+ * that must be done:
+ * 1) per word
+ * 2) per page
+ * 3) per call to the FS
+ * If the FS is called per page, then it turns out that 3)
+ * costs more than 1) and 2) for sophisticated filesystems. To
+ * allow the FS to choose to pay the cost of 3) only once we
+ * call batch_write, if the FS supports it.
+ */
+ if (a_ops->batch_write)
+ status = a_ops->batch_write(file, &desc, &lru_pvec,
+ &cached_page, &copied);
+ else
+ status = generic_batch_write(file, &desc, &lru_pvec,
+ &cached_page, &copied);
+ if (likely(copied > 0)) {
+ written += copied;
+ desc.count -= copied;
+ if (desc.count) {
+ /*
+ * not everything is written yet. Adjust write
+ * descriptor for next iteration
+ */
+ desc.pos += copied;
+ if (unlikely(nr_segs > 1))
+ filemap_set_next_iovec(&desc.cur_iov,
+ &desc.iov_off,
+ copied);
+ else
+ desc.iov_off += copied;
+ desc.buf = desc.cur_iov->iov_base +
+ desc.iov_off;
+ }
+ }
if (status < 0)
break;
balance_dirty_pages_ratelimited(mapping);
cond_resched();
- } while (count);
- *ppos = pos;
+ } while (desc.count);
+ *ppos = pos + written;
if (cached_page)
page_cache_release(cached_page);
_
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-06-29 19:17 [PATCH 1/2] batch-write.patch Hans Reiser
@ 2006-06-29 21:11 ` Chase Venters
2006-06-29 21:21 ` Chase Venters
2006-07-06 4:45 ` Hans Reiser
2006-06-30 1:50 ` Andrew Morton
1 sibling, 2 replies; 20+ messages in thread
From: Chase Venters @ 2006-06-29 21:11 UTC (permalink / raw)
To: Hans Reiser; +Cc: Andrew Morton, LKML
On Thu, 29 Jun 2006, Hans Reiser wrote:
>
> (patch was attached)
>
Not quoted because patch isn't inlined, but you're testing for the
presence of the batch_write pointer repeatedly in the loop. How about
declare a batch_write ptr on the stack and then do your test once, outside
of your do { } while (count) loop, and then set it to the generic method
(before entering the loop) if the generic method isn't available?
Thanks,
Chase
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-06-29 21:11 ` Chase Venters
@ 2006-06-29 21:21 ` Chase Venters
2006-07-06 4:45 ` Hans Reiser
1 sibling, 0 replies; 20+ messages in thread
From: Chase Venters @ 2006-06-29 21:21 UTC (permalink / raw)
To: Chase Venters; +Cc: Hans Reiser, Andrew Morton, LKML
On Thu, 29 Jun 2006, Chase Venters wrote:
> On Thu, 29 Jun 2006, Hans Reiser wrote:
>
>>
>> (patch was attached)
>>
>
> Not quoted because patch isn't inlined, but you're testing for the presence
> of the batch_write pointer repeatedly in the loop. How about declare a
> batch_write ptr on the stack and then do your test once, outside of your do {
> } while (count) loop, and then set it to the generic method (before entering
> the loop) if the generic method isn't available?
^^^^^^^^^
s/available/overridden/;
> Thanks,
> Chase
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-06-29 19:17 [PATCH 1/2] batch-write.patch Hans Reiser
2006-06-29 21:11 ` Chase Venters
@ 2006-06-30 1:50 ` Andrew Morton
2006-07-04 11:12 ` Vladimir V. Saveliev
1 sibling, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2006-06-30 1:50 UTC (permalink / raw)
To: Hans Reiser; +Cc: linux-kernel, reiserfs-dev
On Thu, 29 Jun 2006 12:17:36 -0700
Hans Reiser <reiser@namesys.com> wrote:
>
>
> This patch adds a method batch_write to struct address_space_operations.
> A filesystem may want to implement this operation to improve write performance.
> Generic implementation for the method is made by cut-n-paste off generic_file_buffered_write:
> it writes one page using prepare_write and commit_write address space operations.
>
Please remember to fill in the authorship and signed-off-by info.
The approach seems sane.
>
>
>
> diff -puN include/linux/fs.h~batched-write include/linux/fs.h
> --- linux-2.6.17-mm3/include/linux/fs.h~batched-write 2006-06-28 21:39:27.000000000 +0400
> +++ linux-2.6.17-mm3-root/include/linux/fs.h 2006-06-28 21:39:27.000000000 +0400
> @@ -346,6 +346,39 @@ enum positive_aop_returns {
> struct page;
> struct address_space;
> struct writeback_control;
> +typedef struct write_descriptor write_descriptor_t;
Let's just use `struct write_descriptor' throughout please.
> +
> +/*
> + * This is "write_actor" function type, used by write() to copy data from user
> + * space. There are two functions of this type: write_actor and
> + * write_iovec_actor. First is used when user data are in one segment, second
> + * is used in case of vectored write.
> + */
> +typedef size_t (*write_actor_t)(struct page *, unsigned long, size_t,
> + const write_descriptor_t *);
But yes, we do use typedefs for these things - they're such a mouthful.
Can we please fill in the variable names in declarations such as this? You
look at it and wonder what that `unsigned long' and `size_t' actually _do_.
It's so much clearer if the names are filled in.
> +/**
> + * struct write_descriptor - set of write arguments
> + * @pos: offset from the start of the file to write to
> + * @count: number of bytes to write
> + * @cur_iov: current i/o vector
> + * @iov_off: offset within current i/o vector
> + * @buf: user data
> + * @actor: function to copy user data with
> + *
> + * This structure is to pass to batch_write address space operation all
> + * information which is needed to continue write.
> + */
> +struct write_descriptor {
> + loff_t pos;
> + size_t count;
> + const struct iovec *cur_iov;
> + size_t iov_off;
> + char __user *buf;
> + write_actor_t actor;
> +};
Boy, it's complex, isn't it? Not your fault though.
> +#include <linux/pagevec.h>
A simple declaration:
struct pagevec;
would suffice here. Please place it right at the top-of-file, with the others.
> struct address_space_operations {
> int (*writepage)(struct page *page, struct writeback_control *wbc);
> @@ -367,6 +400,8 @@ struct address_space_operations {
> */
> int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
> int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> + long (*batch_write)(struct file *, const write_descriptor_t *,
> + struct pagevec *, struct page **, size_t *);
It's nice to fill in the names here too, IMO. Yes, there are past sins
there.
Should this be an address_space_operation or a file_operation?
> +/**
> + * generic_batch_write - generic implementation of batched write
generic_file_batch_write(), perhaps? I guess that depends upon the
answer to the previous question.
> + * @file: the file to write to
> + * @desc: set of write arguments
> + * @lru_pvec: multipage container to batch adding pages to LRU list
> + * @cached_page: allocated but not used on previous call
> + * @written: returned number of bytes successfully written
> + *
> + * This implementation of batch_write method writes not more than one page of
> + * file. It faults in user space, allocates page and calls prepare_write and
> + * commit_write address space operations. User data are copied by an actor
> + * which is set by caller depending on whether write or writev is on the way.
> + */
> +static long generic_batch_write(struct file *file,
> + const write_descriptor_t *desc,
> + struct pagevec *lru_pvec,
> + struct page **cached_page, size_t *written)
I wonder if we really need that cached_page thing.
Did you consider putting more of these arguments into the (now non-const)
`struct write_descriptor'? It'd be sligtly more efficient and would save
stack on some of our stack-hungriest codepaths.
> +{
> + const struct address_space_operations *a_ops = file->f_mapping->a_ops;
> + struct page *page;
> + unsigned long index;
> + size_t bytes;
> + unsigned long offset;
> + long status;
> +
> + /* offset within page write is to start at */
> + offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
>
> - do {
> - unsigned long index;
> - unsigned long offset;
> - size_t copied;
> -
> - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
> - index = pos >> PAGE_CACHE_SHIFT;
> - bytes = PAGE_CACHE_SIZE - offset;
> + /* index of page we are to write to */
> + index = desc->pos >> PAGE_CACHE_SHIFT;
>
> - /* Limit the size of the copy to the caller's write size */
> - bytes = min(bytes, count);
> + /* number of bytes which can be written to the page */
> + bytes = PAGE_CACHE_SIZE - offset;
>
> - /*
> - * Limit the size of the copy to that of the current segment,
> - * because fault_in_pages_readable() doesn't know how to walk
> - * segments.
> - */
> - bytes = min(bytes, cur_iov->iov_len - iov_base);
> + /* Limit the size of the copy to the caller's write size */
> + bytes = min(bytes, desc->count);
>
> + /*
> + * Limit the size of the copy to that of the current segment,
> + * because fault_in_pages_readable() doesn't know how to walk
> + * segments.
> + */
> + bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off);
> +
> + while (1) {
> /*
> * Bring in the user page that we will copy from _first_.
> * Otherwise there's a nasty deadlock on copying from the
> * same page as we're writing to, without it being marked
> * up-to-date.
> */
> - fault_in_pages_readable(buf, bytes);
> + fault_in_pages_readable(desc->buf, bytes);
>
> - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> - if (!page) {
> - status = -ENOMEM;
> - break;
> - }
> + page = __grab_cache_page(file->f_mapping, index, cached_page,
> + lru_pvec);
> + if (!page)
> + return -ENOMEM;
>
> - status = a_ops->prepare_write(file, page, offset, offset+bytes);
> + status = a_ops->prepare_write(file, page, offset,
> + offset+bytes);
The code you're patching here changed this morning, and it'll change again
(a little bit) in the next few days.
And Neil has suggested some simplifications to filemap_set_next_iovec(),
although I don't think anyone's working on that at present (everyone's
justifiably afraid to touch this code).
> +
> +ssize_t
> +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos, loff_t *ppos,
> + size_t count, ssize_t written)
> +{
> + struct file *file = iocb->ki_filp;
> + struct address_space * mapping = file->f_mapping;
> + const struct address_space_operations *a_ops = mapping->a_ops;
> + struct inode *inode = mapping->host;
> + long status;
> + struct page *cached_page = NULL;
> + struct pagevec lru_pvec;
> + write_descriptor_t desc;
> + size_t copied = 0;
> +
> + pagevec_init(&lru_pvec, 0);
> +
> + /*
> + * initialize write descriptor fields: position to write to
> + * and number of bytes to write
> + */
> + desc.pos = pos;
> + desc.count = count;
> +
> + /*
> + * handle partial DIO write. Adjust cur_iov if needed.
> + */
> + if (likely(nr_segs == 1)) {
> + desc.cur_iov = iov;
> + desc.iov_off = written;
> + desc.actor = write_actor;
> + } else {
> + filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written);
> + desc.actor = write_iovec_actor;
> + }
> + /* pointer to user buffer */
> + desc.buf = desc.cur_iov->iov_base + desc.iov_off;
> +
> + do {
> + /*
> + * When calling the filesystem for writes, there is processing
> + * that must be done:
> + * 1) per word
> + * 2) per page
> + * 3) per call to the FS
> + * If the FS is called per page, then it turns out that 3)
> + * costs more than 1) and 2) for sophisticated filesystems. To
> + * allow the FS to choose to pay the cost of 3) only once we
> + * call batch_write, if the FS supports it.
> + */
I think it's better that the more philosophical comments like this go in
the comment block at the start of the function.
> + if (a_ops->batch_write)
> + status = a_ops->batch_write(file, &desc, &lru_pvec,
> + &cached_page, &copied);
> + else
> + status = generic_batch_write(file, &desc, &lru_pvec,
> + &cached_page, &copied);
> + if (likely(copied > 0)) {
This is missing this morning's handle-zero-length-segment bugfix.
> + written += copied;
> + desc.count -= copied;
> + if (desc.count) {
> + /*
> + * not everything is written yet. Adjust write
> + * descriptor for next iteration
> + */
> + desc.pos += copied;
> + if (unlikely(nr_segs > 1))
> + filemap_set_next_iovec(&desc.cur_iov,
> + &desc.iov_off,
> + copied);
> + else
> + desc.iov_off += copied;
> + desc.buf = desc.cur_iov->iov_base +
> + desc.iov_off;
> + }
> + }
> if (status < 0)
> break;
> balance_dirty_pages_ratelimited(mapping);
> cond_resched();
> - } while (count);
> - *ppos = pos;
> + } while (desc.count);
> + *ppos = pos + written;
>
> if (cached_page)
> page_cache_release(cached_page);
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-06-30 1:50 ` Andrew Morton
@ 2006-07-04 11:12 ` Vladimir V. Saveliev
2006-07-04 11:21 ` Vladimir V. Saveliev
2006-07-04 11:48 ` Christoph Hellwig
0 siblings, 2 replies; 20+ messages in thread
From: Vladimir V. Saveliev @ 2006-07-04 11:12 UTC (permalink / raw)
To: Andrew Morton; +Cc: lkml, reiserfs-dev
[-- Attachment #1: Type: text/plain, Size: 10897 bytes --]
Hello
Andrew, thanks a lot for your improvements.
Here is new version of the patch.
On Thu, 2006-06-29 at 18:50 -0700, Andrew Morton wrote:
> On Thu, 29 Jun 2006 12:17:36 -0700
> Hans Reiser <reiser@namesys.com> wrote:
> >
> >
> > This patch adds a method batch_write to struct address_space_operations.
> > A filesystem may want to implement this operation to improve write performance.
> > Generic implementation for the method is made by cut-n-paste off generic_file_buffered_write:
> > it writes one page using prepare_write and commit_write address space operations.
> >
>
> Please remember to fill in the authorship and signed-off-by info.
>
done.
> The approach seems sane.
>
> >
> >
> >
> > diff -puN include/linux/fs.h~batched-write include/linux/fs.h
> > --- linux-2.6.17-mm3/include/linux/fs.h~batched-write 2006-06-28 21:39:27.000000000 +0400
> > +++ linux-2.6.17-mm3-root/include/linux/fs.h 2006-06-28 21:39:27.000000000 +0400
> > @@ -346,6 +346,39 @@ enum positive_aop_returns {
> > struct page;
> > struct address_space;
> > struct writeback_control;
> > +typedef struct write_descriptor write_descriptor_t;
>
> Let's just use `struct write descriptor' throughout please.
>
done.
> > +
> > +/*
> > + * This is "write_actor" function type, used by write() to copy data from user
> > + * space. There are two functions of this type: write_actor and
> > + * write_iovec_actor. First is used when user data are in one segment, second
> > + * is used in case of vectored write.
> > + */
> > +typedef size_t (*write_actor_t)(struct page *, unsigned long, size_t,
> > + const write_descriptor_t *);
>
> But yes, we do use typedefs for these things - they're such a mouthful.
>
As long as now we never call batch_write to write from more than one
segment - we do not need write_actor. filemap_copy_from_user is only used
to copy data.
> Can we please fill in the variable names in declarations such as this? You
> look at it and wonder what that `unsigned long' and `size_t' actually _do_.
> It's so much clearer if the names are filled in.
>
> > +/**
> > + * struct write_descriptor - set of write arguments
> > + * @pos: offset from the start of the file to write to
> > + * @count: number of bytes to write
> > + * @cur_iov: current i/o vector
> > + * @iov_off: offset within current i/o vector
> > + * @buf: user data
> > + * @actor: function to copy user data with
> > + *
> > + * This structure is to pass to batch_write address space operation all
> > + * information which is needed to continue write.
> > + */
> > +struct write_descriptor {
> > + loff_t pos;
> > + size_t count;
> > + const struct iovec *cur_iov;
> > + size_t iov_off;
> > + char __user *buf;
> > + write_actor_t actor;
> > +};
>
> Boy, it's complex, isn't it? Not your fault though.
>
struct write_descriptor changed. It has 5 fields now.
> > +#include <linux/pagevec.h>
>
> A simple declaration:
>
> struct pagevec;
>
> would suffice here. Please place it right at the top-of-file, with the others.
>
done.
> > struct address_space_operations {
> > int (*writepage)(struct page *page, struct writeback_control *wbc);
> > @@ -367,6 +400,8 @@ struct address_space_operations {
> > */
> > int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
> > int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> > + long (*batch_write)(struct file *, const write_descriptor_t *,
> > + struct pagevec *, struct page **, size_t *);
>
> It's nice to fill in the names here too, IMO. Yes, there are past sins
> there.
>
done.
> Should this be an address_space_operation or a file_operation?
>
I was seeking to be minimal in my changes to the philosophy of the code.
So, it was an address_space operation. Now it is a file operation.
> > +/**
> > + * generic_batch_write - generic implementation of batched write
>
> generic_file_batch_write(), perhaps? I guess that depends upon the
> answer to the previous question.
>
renamed.
> > + * @file: the file to write to
> > + * @desc: set of write arguments
> > + * @lru_pvec: multipage container to batch adding pages to LRU list
> > + * @cached_page: allocated but not used on previous call
> > + * @written: returned number of bytes successfully written
> > + *
> > + * This implementation of batch_write method writes not more than one page of
> > + * file. It faults in user space, allocates page and calls prepare_write and
> > + * commit_write address space operations. User data are copied by an actor
> > + * which is set by caller depending on whether write or writev is on the way.
> > + */
> > +static long generic_batch_write(struct file *file,
> > + const write_descriptor_t *desc,
> > + struct pagevec *lru_pvec,
> > + struct page **cached_page, size_t *written)
>
> I wonder if we really need that cached_page thing.
>
> Did you consider putting more of these arguments into the (now non-const)
> `struct write_descriptor'? It'd be sligtly more efficient and would save
> stack on some of our stack-hungriest codepaths.
>
lru_pvec and cached_page are included to struct write_descriptor.
> > +{
> > + const struct address_space_operations *a_ops = file->f_mapping->a_ops;
> > + struct page *page;
> > + unsigned long index;
> > + size_t bytes;
> > + unsigned long offset;
> > + long status;
> > +
> > + /* offset within page write is to start at */
> > + offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
> >
> > - do {
> > - unsigned long index;
> > - unsigned long offset;
> > - size_t copied;
> > -
> > - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
> > - index = pos >> PAGE_CACHE_SHIFT;
> > - bytes = PAGE_CACHE_SIZE - offset;
> > + /* index of page we are to write to */
> > + index = desc->pos >> PAGE_CACHE_SHIFT;
> >
> > - /* Limit the size of the copy to the caller's write size */
> > - bytes = min(bytes, count);
> > + /* number of bytes which can be written to the page */
> > + bytes = PAGE_CACHE_SIZE - offset;
> >
> > - /*
> > - * Limit the size of the copy to that of the current segment,
> > - * because fault_in_pages_readable() doesn't know how to walk
> > - * segments.
> > - */
> > - bytes = min(bytes, cur_iov->iov_len - iov_base);
> > + /* Limit the size of the copy to the caller's write size */
> > + bytes = min(bytes, desc->count);
> >
> > + /*
> > + * Limit the size of the copy to that of the current segment,
> > + * because fault_in_pages_readable() doesn't know how to walk
> > + * segments.
> > + */
> > + bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off);
> > +
> > + while (1) {
> > /*
> > * Bring in the user page that we will copy from _first_.
> > * Otherwise there's a nasty deadlock on copying from the
> > * same page as we're writing to, without it being marked
> > * up-to-date.
> > */
> > - fault_in_pages_readable(buf, bytes);
> > + fault_in_pages_readable(desc->buf, bytes);
> >
> > - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> > - if (!page) {
> > - status = -ENOMEM;
> > - break;
> > - }
> > + page = __grab_cache_page(file->f_mapping, index, cached_page,
> > + lru_pvec);
> > + if (!page)
> > + return -ENOMEM;
> >
> > - status = a_ops->prepare_write(file, page, offset, offset+bytes);
> > + status = a_ops->prepare_write(file, page, offset,
> > + offset+bytes);
>
> The code you're patching here changed this morning, and it'll change again
> (a little bit) in the next few days.
>
New patch is against 2.6.17-mm5.
> And Neil has suggested some simplifications to filemap_set_next_iovec(),
> although I don't think anyone's working on that at present (everyone's
> justifiably afraid to touch this code).
> > +
> > +ssize_t
> > +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> > + unsigned long nr_segs, loff_t pos, loff_t *ppos,
> > + size_t count, ssize_t written)
> > +{
> > + struct file *file = iocb->ki_filp;
> > + struct address_space * mapping = file->f_mapping;
> > + const struct address_space_operations *a_ops = mapping->a_ops;
> > + struct inode *inode = mapping->host;
> > + long status;
> > + struct page *cached_page = NULL;
> > + struct pagevec lru_pvec;
> > + write_descriptor_t desc;
> > + size_t copied = 0;
> > +
> > + pagevec_init(&lru_pvec, 0);
> > +
> > + /*
> > + * initialize write descriptor fields: position to write to
> > + * and number of bytes to write
> > + */
> > + desc.pos = pos;
> > + desc.count = count;
> > +
> > + /*
> > + * handle partial DIO write. Adjust cur_iov if needed.
> > + */
> > + if (likely(nr_segs == 1)) {
> > + desc.cur_iov = iov;
> > + desc.iov_off = written;
> > + desc.actor = write_actor;
> > + } else {
> > + filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written);
> > + desc.actor = write_iovec_actor;
> > + }
> > + /* pointer to user buffer */
> > + desc.buf = desc.cur_iov->iov_base + desc.iov_off;
> > +
> > + do {
> > + /*
> > + * When calling the filesystem for writes, there is processing
> > + * that must be done:
> > + * 1) per word
> > + * 2) per page
> > + * 3) per call to the FS
> > + * If the FS is called per page, then it turns out that 3)
> > + * costs more than 1) and 2) for sophisticated filesystems. To
> > + * allow the FS to choose to pay the cost of 3) only once we
> > + * call batch_write, if the FS supports it.
> > + */
>
> I think it's better that the more philosophical comments like this go in
> the comment block at the start of the function.
>
This comment is moved at the start of this function.
> > + if (a_ops->batch_write)
> > + status = a_ops->batch_write(file, &desc, &lru_pvec,
> > + &cached_page, &copied);
> > + else
> > + status = generic_batch_write(file, &desc, &lru_pvec,
> > + &cached_page, &copied);
> > + if (likely(copied > 0)) {
>
> This is missing this morning's handle-zero-length-segment bugfix.
>
I hope new patch deals properly with zero length segments. I writev-ed a
vector which had 3 zero length segments: first, last and 3rd.
> > + written += copied;
> > + desc.count -= copied;
> > + if (desc.count) {
> > + /*
> > + * not everything is written yet. Adjust write
> > + * descriptor for next iteration
> > + */
> > + desc.pos += copied;
> > + if (unlikely(nr_segs > 1))
> > + filemap_set_next_iovec(&desc.cur_iov,
> > + &desc.iov_off,
> > + copied);
> > + else
> > + desc.iov_off += copied;
> > + desc.buf = desc.cur_iov->iov_base +
> > + desc.iov_off;
> > + }
> > + }
> > if (status < 0)
> > break;
> > balance_dirty_pages_ratelimited(mapping);
> > cond_resched();
> > - } while (count);
> > - *ppos = pos;
> > + } while (desc.count);
> > + *ppos = pos + written;
> >
> > if (cached_page)
> > page_cache_release(cached_page);
>
>
[-- Attachment #2: batched-write.patch --]
[-- Type: message/rfc822, Size: 11146 bytes --]
From:
Subject: No Subject
Date: Tue, 04 Jul 2006 00:52:59 +0400
Message-ID: <1151959979.6335.92.camel@tribesman.namesys.com>
From: Vladimir Saveliev <vs@namesys.com>
This patch adds a method batch_write to struct address_space_operations.
A filesystem may want to implement this operation to improve write performance.
Generic implementation for the method is made by cut-n-paste off generic_file_buffered_write:
it writes one page using prepare_write and commit_write address space operations.
Signed-off-by: Vladimir Saveliev <vs@namesys.com>
diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-mm5/include/linux/fs.h~batched-write 2006-07-03 18:16:44.000000000 +0400
+++ linux-2.6.17-mm5-vs/include/linux/fs.h 2006-07-04 00:31:41.000000000 +0400
@@ -246,6 +246,7 @@ struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
+struct pagevec;
extern void __init inode_init(unsigned long);
extern void __init inode_init_early(void);
@@ -347,6 +348,25 @@ struct page;
struct address_space;
struct writeback_control;
+/**
+ * struct write_descriptor - set of write arguments
+ * @pos: offset from the start of the file to write to
+ * @count: number of bytes to write
+ * @buf: pointer to data to be written
+ * @lru_pvec: multipage container to batch adding pages to LRU list
+ * @cached_page: allocated but not used on previous call
+ *
+ * This structure is to pass to batch_write address space operation
+ * all information which is needed to continue write.
+ */
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ char __user *buf;
+ struct page *cached_page;
+ struct pagevec *lru_pvec;
+};
+
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
@@ -367,6 +387,8 @@ struct address_space_operations {
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-mm5/mm/filemap.c~batched-write 2006-07-03 18:16:44.000000000 +0400
+++ linux-2.6.17-mm5-vs/mm/filemap.c 2006-07-04 00:59:56.000000000 +0400
@@ -2159,78 +2159,58 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/**
+ * generic_batch_write - generic implementation of batched write
+ * @file: the file to write to
+ * @desc: set of write arguments
+ * @written: returned number of bytes successfully written
+ *
+ * This implementation of batch_write method writes not more than one page of
+ * file. It faults in user space, allocates page and calls prepare_write and
+ * commit_write address space operations. User data are copied by an actor
+ * which is set by caller depending on whether write or writev is on the way.
+ */
+static long generic_batch_write(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
-
- pagevec_init(&lru_pvec, 0);
-
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
-
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
-
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ long status;
+
+ /* offset within page write is to start at */
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
+
+ /* index of page we are to write to */
+ index = desc->pos >> PAGE_CACHE_SHIFT;
+
+ /* number of bytes which can be written to the page */
+ bytes = PAGE_CACHE_SIZE - offset;
+
+ /* Limit the size of the copy to the caller's write size */
+ bytes = min(bytes, desc->count);
+ BUG_ON(bytes == 0);
+ while (1) {
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ fault_in_pages_readable(desc->buf, bytes);
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
-
- if (unlikely(bytes == 0)) {
- status = 0;
- copied = 0;
- goto zero_length_segment;
- }
+ page = __grab_cache_page(file->f_mapping, index, &desc->cached_page,
+ desc->lru_pvec);
+ if (!page)
+ return -ENOMEM;
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);
if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2238,61 +2218,139 @@ generic_file_buffered_write(struct kiocb
if (status == AOP_TRUNCATED_PAGE)
continue;
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * prepare_write() may have instantiated a few
+ * blocks outside i_size. Trim these off
+ * again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ /*
+ * call write actor in order to copy user data to the
+ * page
+ */
+ *written = filemap_copy_from_user(page, offset, desc->buf,
+ bytes);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
-zero_length_segment:
- if (likely(copied >= 0)) {
- if (!status)
- status = copied;
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+ break;
+ }
+ /*
+ * If commit_write returned error - write failed and we zero
+ * number of written bytes. If filemap_copy_from_user copied
+ * less than it was asked to we return -EFAULT and number of
+ * bytes actually written.
+ */
+ if (status)
+ *written = 0;
+ else if (*written != bytes)
+ status = -EFAULT;
+ return status;
+}
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
+/*
+ * When calling the filesystem for writes, there is processing
+ * that must be done:
+ * 1) per word
+ * 2) per page
+ * 3) per call to the FS
+ * If the FS is called per page, then it turns out that 3) costs more
+ * than 1) and 2) for sophisticated filesystems. To allow the FS to
+ * choose to pay the cost of 3) only once we call batch_write, if the
+ * FS supports it.
+ */
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct pagevec lru_pvec;
+ struct write_descriptor desc;
+ size_t copied = 0;
+ const struct iovec *cur_iov = iov; /* current iovec */
+ size_t iov_base = 0; /* offset in the current iovec */
+ long (*batch_write)(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written);
+
+ pagevec_init(&lru_pvec, 0);
+
+ /*
+ * initialize write descriptor fields: position to write to
+ * and number of bytes to write
+ */
+ desc.pos = pos;
+ desc.cached_page = NULL;
+ desc.lru_pvec = &lru_pvec;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1))
+ iov_base = written;
+ else
+ filemap_set_next_iovec(&cur_iov, &iov_base, written);
+
+ /*
+ * if file system implements batch_write method - use it,
+ * otherwise - use generic_batch_write
+ */
+ if (a_ops->batch_write)
+ batch_write = a_ops->batch_write;
+ else
+ batch_write = generic_batch_write;
+
+ do {
+ /* do not walk over current segment */
+ desc.buf = cur_iov->iov_base + iov_base;
+ desc.count = cur_iov->iov_len - iov_base;
+ if (desc.count > 0)
+ status = batch_write(file, &desc, &copied);
+ else {
+ copied = 0;
+ status = 0;
+ }
+ if (likely(copied >= 0)) {
+ written += copied;
+ count -= copied;
+ if (count) {
+ /*
+ * not everything is written yet. Adjust write
+ * descriptor for next iteration
+ */
+ desc.pos += copied;
+ if (likely(nr_segs == 1))
+ iov_base += copied;
+ else
filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
+ &iov_base,
+ copied);
}
}
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (status < 0)
- break;
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- } while (count);
- *ppos = pos;
-
- if (cached_page)
- page_cache_release(cached_page);
+ if (status < 0)
+ break;
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (count);
+ *ppos = pos + written;
+
+ if (desc.cached_page)
+ page_cache_release(desc.cached_page);
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
_
[-- Attachment #3: batched-write.patch --]
[-- Type: message/rfc822, Size: 12527 bytes --]
From:
Subject: No Subject
Date: Tue, 04 Jul 2006 15:12:56 +0400
Message-ID: <1152011576.6454.37.camel@tribesman.namesys.com>
From: Vladimir Saveliev <vs@namesys.com>
This patch adds a method batch_write to struct file_operations.
A filesystem may want to implement this operation to improve write performance.
Generic implementation for the method is made by cut-n-paste of generic_file_buffered_write:
it writes one page using prepare_write and commit_write address space operations.
Signed-off-by: Vladimir Saveliev <vs@namesys.com>
diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-mm5/include/linux/fs.h~batched-write 2006-07-04 14:07:47.000000000 +0400
+++ linux-2.6.17-mm5-vs/include/linux/fs.h 2006-07-04 14:39:11.000000000 +0400
@@ -246,6 +246,7 @@ struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
+struct pagevec;
extern void __init inode_init(unsigned long);
extern void __init inode_init_early(void);
@@ -1096,6 +1097,25 @@ typedef int (*read_actor_t)(read_descrip
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1
+/**
+ * struct write_descriptor - set of write arguments
+ * @pos: offset from the start of the file to write to
+ * @count: number of bytes to write
+ * @buf: pointer to data to be written
+ * @lru_pvec: multipage container to batch adding pages to LRU list
+ * @cached_page: allocated but not used on previous call
+ *
+ * This structure is to pass to batch_write file operation all
+ * information which is needed to continue write.
+ */
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ char __user *buf;
+ struct page *cached_page;
+ struct pagevec *lru_pvec;
+};
+
/*
* NOTE:
* read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl
@@ -1123,6 +1143,8 @@ struct file_operations {
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-mm5/mm/filemap.c~batched-write 2006-07-04 14:07:47.000000000 +0400
+++ linux-2.6.17-mm5-vs/mm/filemap.c 2006-07-04 15:09:58.000000000 +0400
@@ -2159,78 +2159,59 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/**
+ * generic_file_batch_write - generic batch_write file operation
+ * @file: the file to write to
+ * @desc: set of write arguments
+ * @written: returned number of bytes successfully written
+ *
+ * This implementation of batch_write file operation method writes not
+ * more than one page of file. It faults in user space, allocates page
+ * and calls prepare_write and commit_write address space
+ * operations. User data are copied by filemap_copy_from_user.
+ */
+static long generic_file_batch_write(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
-
- pagevec_init(&lru_pvec, 0);
-
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
-
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
-
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ long status;
+
+ /* offset within page write is to start at */
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
+
+ /* index of page we are to write to */
+ index = desc->pos >> PAGE_CACHE_SHIFT;
+
+ /* number of bytes which can be written to the page */
+ bytes = PAGE_CACHE_SIZE - offset;
+
+ /* limit the size of the copy to the caller's write size */
+ bytes = min(bytes, desc->count);
+ BUG_ON(bytes == 0);
+ while (1) {
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ fault_in_pages_readable(desc->buf, bytes);
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
+ page = __grab_cache_page(file->f_mapping, index,
+ &desc->cached_page, desc->lru_pvec);
+ if (!page)
+ return -ENOMEM;
- if (unlikely(bytes == 0)) {
- status = 0;
- copied = 0;
- goto zero_length_segment;
- }
-
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
+ status = a_ops->prepare_write(file, page, offset,
+ offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);
if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2238,61 +2219,139 @@ generic_file_buffered_write(struct kiocb
if (status == AOP_TRUNCATED_PAGE)
continue;
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * prepare_write() may have instantiated a few
+ * blocks outside i_size. Trim these off
+ * again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ /*
+ * call write actor in order to copy user data to the
+ * page
+ */
+ *written = filemap_copy_from_user(page, offset, desc->buf,
+ bytes);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
-zero_length_segment:
- if (likely(copied >= 0)) {
- if (!status)
- status = copied;
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+ break;
+ }
+ /*
+ * If commit_write returned error - write failed and we zero
+ * number of written bytes. If filemap_copy_from_user copied
+ * less than it was asked to we return -EFAULT and number of
+ * bytes actually written.
+ */
+ if (status)
+ *written = 0;
+ else if (*written != bytes)
+ status = -EFAULT;
+ return status;
+}
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
+/*
+ * When calling the filesystem for writes, there is processing
+ * that must be done:
+ * 1) per word
+ * 2) per page
+ * 3) per call to the FS
+ * If the FS is called per page, then it turns out that 3) costs more
+ * than 1) and 2) for sophisticated filesystems. To allow the FS to
+ * choose to pay the cost of 3) only once we call batch_write, if the
+ * FS supports it.
+ */
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct pagevec lru_pvec;
+ struct write_descriptor desc;
+ size_t copied = 0;
+ const struct iovec *cur_iov = iov; /* current iovec */
+ size_t iov_base = 0; /* offset in the current iovec */
+ long (*batch_write)(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written);
+
+ pagevec_init(&lru_pvec, 0);
+
+ /*
+ * initialize write descriptor fields: position to write to
+ * and number of bytes to write
+ */
+ desc.pos = pos;
+ desc.cached_page = NULL;
+ desc.lru_pvec = &lru_pvec;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1))
+ iov_base = written;
+ else
+ filemap_set_next_iovec(&cur_iov, &iov_base, written);
+
+ /*
+ * if file system implements batch_write method - use it,
+ * otherwise - use generic_batch_write
+ */
+ if (file->f_op->batch_write)
+ batch_write = file->f_op->batch_write;
+ else
+ batch_write = generic_file_batch_write;
+
+ do {
+ /* do not walk over current segment */
+ desc.buf = cur_iov->iov_base + iov_base;
+ desc.count = cur_iov->iov_len - iov_base;
+ if (desc.count > 0)
+ status = batch_write(file, &desc, &copied);
+ else {
+ copied = 0;
+ status = 0;
+ }
+ if (likely(copied >= 0)) {
+ written += copied;
+ count -= copied;
+ if (count) {
+ /*
+ * not everything is written yet. Adjust write
+ * descriptor for next iteration
+ */
+ desc.pos += copied;
+ if (likely(nr_segs == 1))
+ iov_base += copied;
+ else
filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
+ &iov_base,
+ copied);
}
}
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (status < 0)
- break;
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- } while (count);
- *ppos = pos;
-
- if (cached_page)
- page_cache_release(cached_page);
+ if (status < 0)
+ break;
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (count);
+ *ppos = pos + written;
+
+ if (desc.cached_page)
+ page_cache_release(desc.cached_page);
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
diff -puN Documentation/filesystems/vfs.txt~batched-write Documentation/filesystems/vfs.txt
--- linux-2.6.17-mm5/Documentation/filesystems/vfs.txt~batched-write 2006-07-04 14:34:53.000000000 +0400
+++ linux-2.6.17-mm5-vs/Documentation/filesystems/vfs.txt 2006-07-04 14:37:16.000000000 +0400
@@ -717,6 +717,8 @@ struct file_operations {
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -784,6 +786,8 @@ otherwise noted.
writev: called by the writev(2) system call
+ batch_write: optional, if implemented called by writev(2) and write(2)
+
sendfile: called by the sendfile(2) system call
get_unmapped_area: called by the mmap(2) system call
_
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 11:12 ` Vladimir V. Saveliev
@ 2006-07-04 11:21 ` Vladimir V. Saveliev
2006-07-04 12:07 ` Pekka Enberg
2006-07-04 11:48 ` Christoph Hellwig
1 sibling, 1 reply; 20+ messages in thread
From: Vladimir V. Saveliev @ 2006-07-04 11:21 UTC (permalink / raw)
To: Andrew Morton; +Cc: lkml, reiserfs-dev
[-- Attachment #1: Type: text/plain, Size: 11732 bytes --]
Hello
Sorry, previous mail contained 2 patches incorrectly inlined. This time
everything should be correct.
On Tue, 2006-07-04 at 15:12 +0400, Vladimir V. Saveliev wrote:
> Hello
>
> Andrew, thanks a lot for your improvements.
> Here is new version of the patch.
>
> On Thu, 2006-06-29 at 18:50 -0700, Andrew Morton wrote:
> > On Thu, 29 Jun 2006 12:17:36 -0700
> > Hans Reiser <reiser@namesys.com> wrote:
> > >
> > >
> > > This patch adds a method batch_write to struct address_space_operations.
> > > A filesystem may want to implement this operation to improve write performance.
> > > Generic implementation for the method is made by cut-n-paste of generic_file_buffered_write:
> > > it writes one page using prepare_write and commit_write address space operations.
> > >
> >
> > Please remember to fill in the authorship and signed-off-by info.
> >
>
> done.
>
> > The approach seems sane.
> >
> > >
> > >
> > >
> > > diff -puN include/linux/fs.h~batched-write include/linux/fs.h
> > > --- linux-2.6.17-mm3/include/linux/fs.h~batched-write 2006-06-28 21:39:27.000000000 +0400
> > > +++ linux-2.6.17-mm3-root/include/linux/fs.h 2006-06-28 21:39:27.000000000 +0400
> > > @@ -346,6 +346,39 @@ enum positive_aop_returns {
> > > struct page;
> > > struct address_space;
> > > struct writeback_control;
> > > +typedef struct write_descriptor write_descriptor_t;
> >
> > Let's just use `struct write_descriptor' throughout please.
> >
>
> done.
>
> > > +
> > > +/*
> > > + * This is "write_actor" function type, used by write() to copy data from user
> > > + * space. There are two functions of this type: write_actor and
> > > + * write_iovec_actor. First is used when user data are in one segment, second
> > > + * is used in case of vectored write.
> > > + */
> > > +typedef size_t (*write_actor_t)(struct page *, unsigned long, size_t,
> > > + const write_descriptor_t *);
> >
> > But yes, we do use typedefs for these things - they're such a mouthful.
> >
>
> As long as now we never call batch_write to write from more than one
> segment - we do not need write_actor. filemap_copy_from_user is only used
> to copy data.
>
> > Can we please fill in the variable names in declarations such as this? You
> > look at it and wonder what that `unsigned long' and `size_t' actually _do_.
> > It's so much clearer if the names are filled in.
> >
> > > +/**
> > > + * struct write_descriptor - set of write arguments
> > > + * @pos: offset from the start of the file to write to
> > > + * @count: number of bytes to write
> > > + * @cur_iov: current i/o vector
> > > + * @iov_off: offset within current i/o vector
> > > + * @buf: user data
> > > + * @actor: function to copy user data with
> > > + *
> > > + * This structure is to pass to batch_write address space operation all
> > > + * information which is needed to continue write.
> > > + */
> > > +struct write_descriptor {
> > > + loff_t pos;
> > > + size_t count;
> > > + const struct iovec *cur_iov;
> > > + size_t iov_off;
> > > + char __user *buf;
> > > + write_actor_t actor;
> > > +};
> >
> > Boy, it's complex, isn't it? Not your fault though.
> >
>
> struct write_descriptor changed. It has 5 fields now.
>
> > > +#include <linux/pagevec.h>
> >
> > A simple declaration:
> >
> > struct pagevec;
> >
> > would suffice here. Please place it right at the top-of-file, with the others.
> >
>
> done.
>
> > > struct address_space_operations {
> > > int (*writepage)(struct page *page, struct writeback_control *wbc);
> > > @@ -367,6 +400,8 @@ struct address_space_operations {
> > > */
> > > int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
> > > int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
> > > + long (*batch_write)(struct file *, const write_descriptor_t *,
> > > + struct pagevec *, struct page **, size_t *);
> >
> > It's nice to fill in the names here too, IMO. Yes, there are past sins
> > there.
> >
>
> done.
>
> > Should this be an address_space_operation or a file_operation?
> >
>
> I was seeking to be minimal in my changes to the philosophy of the code.
> So, it was an address_space operation. Now it is a file operation.
>
>
> > > +/**
> > > + * generic_batch_write - generic implementation of batched write
> >
> > generic_file_batch_write(), perhaps?  I guess that depends upon the
> > answer to the previous question.
> >
>
> renamed.
>
> > > + * @file: the file to write to
> > > + * @desc: set of write arguments
> > > + * @lru_pvec: multipage container to batch adding pages to LRU list
> > > + * @cached_page: allocated but not used on previous call
> > > + * @written: returned number of bytes successfully written
> > > + *
> > > + * This implementation of batch_write method writes not more than one page of
> > > + * file. It faults in user space, allocates page and calls prepare_write and
> > > + * commit_write address space operations. User data are copied by an actor
> > > + * which is set by caller depending on whether write or writev is on the way.
> > > + */
> > > +static long generic_batch_write(struct file *file,
> > > + const write_descriptor_t *desc,
> > > + struct pagevec *lru_pvec,
> > > + struct page **cached_page, size_t *written)
> >
> > I wonder if we really need that cached_page thing.
> >
> > Did you consider putting more of these arguments into the (now non-const)
> > `struct write_descriptor'?  It'd be slightly more efficient and would save
> > stack on some of our stack-hungriest codepaths.
> >
>
> lru_pvec and cached_page are included to struct write_descriptor.
>
> > > +{
> > > + const struct address_space_operations *a_ops = file->f_mapping->a_ops;
> > > + struct page *page;
> > > + unsigned long index;
> > > + size_t bytes;
> > > + unsigned long offset;
> > > + long status;
> > > +
> > > + /* offset within page write is to start at */
> > > + offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
> > >
> > > - do {
> > > - unsigned long index;
> > > - unsigned long offset;
> > > - size_t copied;
> > > -
> > > - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
> > > - index = pos >> PAGE_CACHE_SHIFT;
> > > - bytes = PAGE_CACHE_SIZE - offset;
> > > + /* index of page we are to write to */
> > > + index = desc->pos >> PAGE_CACHE_SHIFT;
> > >
> > > - /* Limit the size of the copy to the caller's write size */
> > > - bytes = min(bytes, count);
> > > + /* number of bytes which can be written to the page */
> > > + bytes = PAGE_CACHE_SIZE - offset;
> > >
> > > - /*
> > > - * Limit the size of the copy to that of the current segment,
> > > - * because fault_in_pages_readable() doesn't know how to walk
> > > - * segments.
> > > - */
> > > - bytes = min(bytes, cur_iov->iov_len - iov_base);
> > > + /* Limit the size of the copy to the caller's write size */
> > > + bytes = min(bytes, desc->count);
> > >
> > > + /*
> > > + * Limit the size of the copy to that of the current segment,
> > > + * because fault_in_pages_readable() doesn't know how to walk
> > > + * segments.
> > > + */
> > > + bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off);
> > > +
> > > + while (1) {
> > > /*
> > > * Bring in the user page that we will copy from _first_.
> > > * Otherwise there's a nasty deadlock on copying from the
> > > * same page as we're writing to, without it being marked
> > > * up-to-date.
> > > */
> > > - fault_in_pages_readable(buf, bytes);
> > > + fault_in_pages_readable(desc->buf, bytes);
> > >
> > > - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> > > - if (!page) {
> > > - status = -ENOMEM;
> > > - break;
> > > - }
> > > + page = __grab_cache_page(file->f_mapping, index, cached_page,
> > > + lru_pvec);
> > > + if (!page)
> > > + return -ENOMEM;
> > >
> > > - status = a_ops->prepare_write(file, page, offset, offset+bytes);
> > > + status = a_ops->prepare_write(file, page, offset,
> > > + offset+bytes);
> >
> > The code you're patching here changed this morning, and it'll change again
> > (a little bit) in the next few days.
> >
>
> New patch is against 2.6.17-mm5.
>
> > And Neil has suggested some simplifications to filemap_set_next_iovec(),
> > although I don't think anyone's working on that at present (everyone's
> > justifiably afraid to touch this code).
> > > +
> > > +ssize_t
> > > +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> > > + unsigned long nr_segs, loff_t pos, loff_t *ppos,
> > > + size_t count, ssize_t written)
> > > +{
> > > + struct file *file = iocb->ki_filp;
> > > + struct address_space * mapping = file->f_mapping;
> > > + const struct address_space_operations *a_ops = mapping->a_ops;
> > > + struct inode *inode = mapping->host;
> > > + long status;
> > > + struct page *cached_page = NULL;
> > > + struct pagevec lru_pvec;
> > > + write_descriptor_t desc;
> > > + size_t copied = 0;
> > > +
> > > + pagevec_init(&lru_pvec, 0);
> > > +
> > > + /*
> > > + * initialize write descriptor fields: position to write to
> > > + * and number of bytes to write
> > > + */
> > > + desc.pos = pos;
> > > + desc.count = count;
> > > +
> > > + /*
> > > + * handle partial DIO write. Adjust cur_iov if needed.
> > > + */
> > > + if (likely(nr_segs == 1)) {
> > > + desc.cur_iov = iov;
> > > + desc.iov_off = written;
> > > + desc.actor = write_actor;
> > > + } else {
> > > + filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written);
> > > + desc.actor = write_iovec_actor;
> > > + }
> > > + /* pointer to user buffer */
> > > + desc.buf = desc.cur_iov->iov_base + desc.iov_off;
> > > +
> > > + do {
> > > + /*
> > > + * When calling the filesystem for writes, there is processing
> > > + * that must be done:
> > > + * 1) per word
> > > + * 2) per page
> > > + * 3) per call to the FS
> > > + * If the FS is called per page, then it turns out that 3)
> > > + * costs more than 1) and 2) for sophisticated filesystems. To
> > > + * allow the FS to choose to pay the cost of 3) only once we
> > > + * call batch_write, if the FS supports it.
> > > + */
> >
> > I think it's better that the more philosophical comments like this go in
> > the comment block at the start of the function.
> >
>
> This comment is moved at the start of this function.
>
> > > + if (a_ops->batch_write)
> > > + status = a_ops->batch_write(file, &desc, &lru_pvec,
> > > + &cached_page, &copied);
> > > + else
> > > + status = generic_batch_write(file, &desc, &lru_pvec,
> > > + &cached_page, &copied);
> > > + if (likely(copied > 0)) {
> >
> > This is missing this morning's handle-zero-length-segment bugfix.
> >
>
> I hope new patch deals properly with zero length segments. I writev-ed a
> vector which had 3 zero length segments: first, last and 3rd.
>
> > > + written += copied;
> > > + desc.count -= copied;
> > > + if (desc.count) {
> > > + /*
> > > + * not everything is written yet. Adjust write
> > > + * descriptor for next iteration
> > > + */
> > > + desc.pos += copied;
> > > + if (unlikely(nr_segs > 1))
> > > + filemap_set_next_iovec(&desc.cur_iov,
> > > + &desc.iov_off,
> > > + copied);
> > > + else
> > > + desc.iov_off += copied;
> > > + desc.buf = desc.cur_iov->iov_base +
> > > + desc.iov_off;
> > > + }
> > > + }
> > > if (status < 0)
> > > break;
> > > balance_dirty_pages_ratelimited(mapping);
> > > cond_resched();
> > > - } while (count);
> > > - *ppos = pos;
> > > + } while (desc.count);
> > > + *ppos = pos + written;
> > >
> > > if (cached_page)
> > > page_cache_release(cached_page);
> >
[-- Attachment #2: batched-write.patch --]
[-- Type: message/rfc822, Size: 12527 bytes --]
From:
Subject: No Subject
Date: Tue, 04 Jul 2006 15:21:57 +0400
Message-ID: <1152012117.6454.42.camel@tribesman.namesys.com>
From: Vladimir Saveliev <vs@namesys.com>
This patch adds a method batch_write to struct file_operations.
A filesystem may want to implement this operation to improve write performance.
Generic implementation for the method is made by cut-n-paste of generic_file_buffered_write:
it writes one page using prepare_write and commit_write address space operations.
Signed-off-by: Vladimir Saveliev <vs@namesys.com>
diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-mm5/include/linux/fs.h~batched-write 2006-07-04 14:07:47.000000000 +0400
+++ linux-2.6.17-mm5-vs/include/linux/fs.h 2006-07-04 14:39:11.000000000 +0400
@@ -246,6 +246,7 @@ struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
+struct pagevec;
extern void __init inode_init(unsigned long);
extern void __init inode_init_early(void);
@@ -1096,6 +1097,25 @@ typedef int (*read_actor_t)(read_descrip
#define HAVE_COMPAT_IOCTL 1
#define HAVE_UNLOCKED_IOCTL 1
+/**
+ * struct write_descriptor - set of write arguments
+ * @pos: offset from the start of the file to write to
+ * @count: number of bytes to write
+ * @buf: pointer to data to be written
+ * @lru_pvec: multipage container to batch adding pages to LRU list
+ * @cached_page: allocated but not used on previous call
+ *
+ * This structure is to pass to batch_write file operation all
+ * information which is needed to continue write.
+ */
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ char __user *buf;
+ struct page *cached_page;
+ struct pagevec *lru_pvec;
+};
+
/*
* NOTE:
* read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl
@@ -1123,6 +1143,8 @@ struct file_operations {
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-mm5/mm/filemap.c~batched-write 2006-07-04 14:07:47.000000000 +0400
+++ linux-2.6.17-mm5-vs/mm/filemap.c 2006-07-04 15:09:58.000000000 +0400
@@ -2159,78 +2159,59 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/**
+ * generic_file_batch_write - generic batch_write file operation
+ * @file: the file to write to
+ * @desc: set of write arguments
+ * @written: returned number of bytes successfully written
+ *
+ * This implementation of batch_write file operation method writes not
+ * more than one page of file. It faults in user space, allocates page
+ * and calls prepare_write and commit_write address space
+ * operations. User data are copied by filemap_copy_from_user.
+ */
+static long generic_file_batch_write(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
-
- pagevec_init(&lru_pvec, 0);
-
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
-
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
-
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ long status;
+
+ /* offset within page write is to start at */
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
+
+ /* index of page we are to write to */
+ index = desc->pos >> PAGE_CACHE_SHIFT;
+
+ /* number of bytes which can be written to the page */
+ bytes = PAGE_CACHE_SIZE - offset;
+
+ /* limit the size of the copy to the caller's write size */
+ bytes = min(bytes, desc->count);
+ BUG_ON(bytes == 0);
+ while (1) {
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
+ fault_in_pages_readable(desc->buf, bytes);
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
+ page = __grab_cache_page(file->f_mapping, index,
+ &desc->cached_page, desc->lru_pvec);
+ if (!page)
+ return -ENOMEM;
- if (unlikely(bytes == 0)) {
- status = 0;
- copied = 0;
- goto zero_length_segment;
- }
-
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
+ status = a_ops->prepare_write(file, page, offset,
+ offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);
if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2238,61 +2219,139 @@ generic_file_buffered_write(struct kiocb
if (status == AOP_TRUNCATED_PAGE)
continue;
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * prepare_write() may have instantiated a few
+ * blocks outside i_size. Trim these off
+ * again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ /*
+ * call write actor in order to copy user data to the
+ * page
+ */
+ *written = filemap_copy_from_user(page, offset, desc->buf,
+ bytes);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
-zero_length_segment:
- if (likely(copied >= 0)) {
- if (!status)
- status = copied;
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+ break;
+ }
+ /*
+ * If commit_write returned error - write failed and we zero
+ * number of written bytes. If filemap_copy_from_user copied
+ * less than it was asked to we return -EFAULT and number of
+ * bytes actually written.
+ */
+ if (status)
+ *written = 0;
+ else if (*written != bytes)
+ status = -EFAULT;
+ return status;
+}
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
+/*
+ * When calling the filesystem for writes, there is processing
+ * that must be done:
+ * 1) per word
+ * 2) per page
+ * 3) per call to the FS
+ * If the FS is called per page, then it turns out that 3) costs more
+ * than 1) and 2) for sophisticated filesystems. To allow the FS to
+ * choose to pay the cost of 3) only once we call batch_write, if the
+ * FS supports it.
+ */
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct pagevec lru_pvec;
+ struct write_descriptor desc;
+ size_t copied = 0;
+ const struct iovec *cur_iov = iov; /* current iovec */
+ size_t iov_base = 0; /* offset in the current iovec */
+ long (*batch_write)(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written);
+
+ pagevec_init(&lru_pvec, 0);
+
+ /*
+ * initialize write descriptor fields: position to write to
+ * and number of bytes to write
+ */
+ desc.pos = pos;
+ desc.cached_page = NULL;
+ desc.lru_pvec = &lru_pvec;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1))
+ iov_base = written;
+ else
+ filemap_set_next_iovec(&cur_iov, &iov_base, written);
+
+ /*
+ * if file system implements batch_write method - use it,
+ * otherwise - use generic_batch_write
+ */
+ if (file->f_op->batch_write)
+ batch_write = file->f_op->batch_write;
+ else
+ batch_write = generic_file_batch_write;
+
+ do {
+ /* do not walk over current segment */
+ desc.buf = cur_iov->iov_base + iov_base;
+ desc.count = cur_iov->iov_len - iov_base;
+ if (desc.count > 0)
+ status = batch_write(file, &desc, &copied);
+ else {
+ copied = 0;
+ status = 0;
+ }
+ if (likely(copied >= 0)) {
+ written += copied;
+ count -= copied;
+ if (count) {
+ /*
+ * not everything is written yet. Adjust write
+ * descriptor for next iteration
+ */
+ desc.pos += copied;
+ if (likely(nr_segs == 1))
+ iov_base += copied;
+ else
filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
+ &iov_base,
+ copied);
}
}
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (status < 0)
- break;
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- } while (count);
- *ppos = pos;
-
- if (cached_page)
- page_cache_release(cached_page);
+ if (status < 0)
+ break;
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (count);
+ *ppos = pos + written;
+
+ if (desc.cached_page)
+ page_cache_release(desc.cached_page);
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
diff -puN Documentation/filesystems/vfs.txt~batched-write Documentation/filesystems/vfs.txt
--- linux-2.6.17-mm5/Documentation/filesystems/vfs.txt~batched-write 2006-07-04 14:34:53.000000000 +0400
+++ linux-2.6.17-mm5-vs/Documentation/filesystems/vfs.txt 2006-07-04 14:37:16.000000000 +0400
@@ -717,6 +717,8 @@ struct file_operations {
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -784,6 +786,8 @@ otherwise noted.
writev: called by the writev(2) system call
+ batch_write: optional, if implemented called by writev(2) and write(2)
+
sendfile: called by the sendfile(2) system call
get_unmapped_area: called by the mmap(2) system call
_
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 11:12 ` Vladimir V. Saveliev
2006-07-04 11:21 ` Vladimir V. Saveliev
@ 2006-07-04 11:48 ` Christoph Hellwig
2006-07-04 17:44 ` Hans Reiser
2006-07-05 5:06 ` Alexander Zarochentsev
1 sibling, 2 replies; 20+ messages in thread
From: Christoph Hellwig @ 2006-07-04 11:48 UTC (permalink / raw)
To: Vladimir V. Saveliev; +Cc: Andrew Morton, lkml, reiserfs-dev
On Tue, Jul 04, 2006 at 03:12:56PM +0400, Vladimir V. Saveliev wrote:
>
> > Should this be an address_space_operation or a file_operation?
> >
>
> I was seeking to be minimal in my changes to the philosophy of the code.
> So, it was an address_space operation. Now it is a file operation.
It definitely should not be a file_operation! It works at the address_space
not the much higher file level. Maybe all three should become callbacks
for the generic write routines, but that's left for the future.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 11:21 ` Vladimir V. Saveliev
@ 2006-07-04 12:07 ` Pekka Enberg
2006-07-04 18:43 ` Hans Reiser
0 siblings, 1 reply; 20+ messages in thread
From: Pekka Enberg @ 2006-07-04 12:07 UTC (permalink / raw)
To: Vladimir V. Saveliev; +Cc: Andrew Morton, lkml, reiserfs-dev
On 7/4/06, Vladimir V. Saveliev <vs@namesys.com> wrote:
> @@ -784,6 +786,8 @@ otherwise noted.
>
> writev: called by the writev(2) system call
>
> + batch_write: optional, if implemented called by writev(2) and write(2)
> +
It'd be nice if you added some explanation here why a filesystem
developer would want to implement it.
Pekka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 11:48 ` Christoph Hellwig
@ 2006-07-04 17:44 ` Hans Reiser
2006-07-04 22:18 ` Andrew Morton
2006-07-05 5:06 ` Alexander Zarochentsev
1 sibling, 1 reply; 20+ messages in thread
From: Hans Reiser @ 2006-07-04 17:44 UTC (permalink / raw)
To: Christoph Hellwig, Andrew Morton; +Cc: Vladimir V. Saveliev, lkml, reiserfs-dev
Christoph Hellwig wrote:
>On Tue, Jul 04, 2006 at 03:12:56PM +0400, Vladimir V. Saveliev wrote:
>
>
>>>Should this be an address_space_operation or a file_operation?
>>>
>>>
>>>
>>I was seeking to be minimal in my changes to the philosophy of the code.
>>So, it was an address_space operation. Now it is a file operation.
>>
>>
>
>It definitely should not be a file_operation! It works at the address_space
>not the much higher file level. Maybe all three should become callbacks
>for the generic write routines, but that's left for the future.
>
>
>
>
I don't have a commitment to one way or the other, probably because
there are some things that are unclear in my mind. Could you help me
with them? Can you define what is the address space vs. the file level
please? It is odd to be asking such a basic question, but these things
are genuinely unclear to me. If the use of something varies according
to the file, is it a file method? What things vary according to address
space and not according to file? Should things that vary according to
address space be address space ops and things that vary according to
file be file ops? If that logic seems valid, should a lot more be changed?
Oh, and Andrew, while such things are discussed, could you just pick one
way or the other and let the patch go in?
Hans
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 12:07 ` Pekka Enberg
@ 2006-07-04 18:43 ` Hans Reiser
0 siblings, 0 replies; 20+ messages in thread
From: Hans Reiser @ 2006-07-04 18:43 UTC (permalink / raw)
To: Pekka Enberg; +Cc: Vladimir V. Saveliev, Andrew Morton, lkml, reiserfs-dev
Pekka Enberg wrote:
> On 7/4/06, Vladimir V. Saveliev <vs@namesys.com> wrote:
>
>> @@ -784,6 +786,8 @@ otherwise noted.
>>
>> writev: called by the writev(2) system call
>>
>> + batch_write: optional, if implemented called by writev(2) and
>> write(2)
>> +
>
>
> It'd be nice if you added some explanation here why a filesystem
> developer would want to implement it.
>
> Pekka
>
>
Vladimir, it sounds like he found another place you might insert this
comment:
+/*
+ * When calling the filesystem for writes, there is processing
+ * that must be done:
+ * 1) per word
+ * 2) per page
+ * 3) per call to the FS
+ * If the FS is called per page, then it turns out that 3) costs more
+ * than 1) and 2) for sophisticated filesystems. To allow the FS to
+ * choose to pay the cost of 3) only once we call batch_write, if the
+ * FS supports it.
+ */
Thanks Pekka!
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 17:44 ` Hans Reiser
@ 2006-07-04 22:18 ` Andrew Morton
2006-07-04 22:25 ` Hans Reiser
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Andrew Morton @ 2006-07-04 22:18 UTC (permalink / raw)
To: Hans Reiser; +Cc: hch, vs, Linux-Kernel, reiserfs-dev
On Tue, 04 Jul 2006 10:44:13 -0700
Hans Reiser <reiser@namesys.com> wrote:
> Christoph Hellwig wrote:
>
> >On Tue, Jul 04, 2006 at 03:12:56PM +0400, Vladimir V. Saveliev wrote:
> >
> >
> >>>Should this be an address_space_operation or a file_operation?
> >>>
> >>>
> >>>
> >>I was seeking to be minimal in my changes to the philosophy of the code.
> >>So, it was an address_space operation. Now it is a file operation.
> >>
> >>
> >
> >It definitely should not be a file_operation! It works at the address_space
> >not the much higher file level. Maybe all three should become callbacks
> >for the generic write routines, but that's left for the future.
> >
> >
> >
> >
> I don't have a commitment to one way or the other, probably because
> there are some things that are unclear in my mind. Could you help me
> with them? Can you define what is the address space vs. the file level
> please? It is odd to be asking such a basic question, but these things
> are genuinely unclear to me. If the use of something varies according
> to the file, is it a file method? What things vary according to address
> space and not according to file? Should things that vary according to
> address space be address space ops and things that vary according to
> file be file ops? If that logic seems valid, should a lot more be changed?
>
> Oh, and Andrew, while such things are discussed, could you just pick one
> way or the other and let the patch go in?
>
I wasn't sure, which is why I asked rather than suggested..
Looking closer, yes I agree with Christoph, sorry. It's called at the same
level as ->prepare_write/commit_write so if there's any logic to this, it's
logical that batched_write be an a_op too.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 22:18 ` Andrew Morton
@ 2006-07-04 22:25 ` Hans Reiser
2006-07-04 22:39 ` Hans Reiser
2006-07-05 16:45 ` Vladimir V. Saveliev
2 siblings, 0 replies; 20+ messages in thread
From: Hans Reiser @ 2006-07-04 22:25 UTC (permalink / raw)
To: Andrew Morton; +Cc: hch, vs, Linux-Kernel, reiserfs-dev
Andrew Morton wrote:
>
> It's called at the same
>level as ->prepare_write/commit_write so if there's any logic to this, it's
>logical that batched_write be an a_op too.
>
>
I agree that prepare_write/commit_write should be the same as
batch_write, but maybe they all should be file ops?
vs will send a patch which makes batch_write an a_op tomorrow.....
Hans
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 22:18 ` Andrew Morton
2006-07-04 22:25 ` Hans Reiser
@ 2006-07-04 22:39 ` Hans Reiser
2006-07-05 16:45 ` Vladimir V. Saveliev
2 siblings, 0 replies; 20+ messages in thread
From: Hans Reiser @ 2006-07-04 22:39 UTC (permalink / raw)
To: Andrew Morton; +Cc: hch, vs, Linux-Kernel, reiserfs-dev
Andrew, I just wanted to thank you for all the detailed comments you
made. As I have said in the past, any time your schedule might allow
you for reading and criticizing reiser4 is a gift we will be grateful
for. I have the greatest respect for the code you write.
Hans
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 11:48 ` Christoph Hellwig
2006-07-04 17:44 ` Hans Reiser
@ 2006-07-05 5:06 ` Alexander Zarochentsev
1 sibling, 0 replies; 20+ messages in thread
From: Alexander Zarochentsev @ 2006-07-05 5:06 UTC (permalink / raw)
To: reiserfs-dev; +Cc: Christoph Hellwig, Vladimir V. Saveliev, Andrew Morton, lkml
On Tuesday 04 July 2006 15:48, Christoph Hellwig wrote:
> On Tue, Jul 04, 2006 at 03:12:56PM +0400, Vladimir V. Saveliev wrote:
> > > Should this be an address_space_operation or a file_operation?
> >
> > I was seeking to be minimal in my changes to the philosophy of the
> > code. So, it was an address_space operation. Now it is a file
> > operation.
>
> It definitely should not be a file_operation!
> It works at the
> address_space
generic_batch_write works with the page cache; another batch_write
implementation may not.
Except for the cached_page and pagevec, which are generic_batch_write
context, the batch_write switch leaves the file-level stuff _before_ the
switch and the use of the page cache _after_ the switch. That gives a
file system much flexibility to choose between a simple page cache
buffered write, a batched version of the page cache write, or even a
write method that is not page-oriented, like the reiser4 write method
for packed (tails only) files. An address space op which does not use
the page cache looks better as a file op.
> not the much higher file level.
> Maybe all three should
> become callbacks for the generic write routines, but that's left for
> the future.
Thanks,
Alex.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-04 22:18 ` Andrew Morton
2006-07-04 22:25 ` Hans Reiser
2006-07-04 22:39 ` Hans Reiser
@ 2006-07-05 16:45 ` Vladimir V. Saveliev
2006-07-05 19:26 ` Andrew Morton
2006-07-09 20:11 ` Christoph Hellwig
2 siblings, 2 replies; 20+ messages in thread
From: Vladimir V. Saveliev @ 2006-07-05 16:45 UTC (permalink / raw)
To: Andrew Morton; +Cc: Hans Reiser, hch, Linux-Kernel, reiserfs-dev
[-- Attachment #1: Type: text/plain, Size: 1974 bytes --]
Hello
On Tue, 2006-07-04 at 15:18 -0700, Andrew Morton wrote:
> On Tue, 04 Jul 2006 10:44:13 -0700
> Hans Reiser <reiser@namesys.com> wrote:
>
> > Christoph Hellwig wrote:
> >
> > >On Tue, Jul 04, 2006 at 03:12:56PM +0400, Vladimir V. Saveliev wrote:
> > >
> > >
> > >>>Should this be an address_space_operation or a file_operation?
> > >>>
> > >>>
> > >>>
> > >>I was seeking to be minimal in my changes to the philosophy of the code.
> > >>So, it was an address_space operation. Now it is a file operation.
> > >>
> > >>
> > >
> > >It definitely should not be a file_operation! It works at the address_space
> > >not the much higher file level. Maybe all three should become callbacks
> > >for the generic write routines, but that's left for the future.
> > >
> > >
> > >
> > >
> > I don't have a commitment to one way or the other, probably because
> > there are some things that are unclear in my mind. Could you help me
> > with them? Can you define what is the address space vs. the file level
> > please? It is odd to be asking such a basic question, but these things
> > are genuinely unclear to me. If the use of something varies according
> > to the file, is it a file method? What things vary according to address
> > space and not according to file? Should things that vary according to
> > address space be address space ops and things that vary according to
> > file be file ops? If that logic seems valid, should a lot more be changed?
> >
> > Oh, and Andrew, while such things are discussed, could you just pick one
> > way or the other and let the patch go in?
> >
>
> I wasn't sure, which is why I asked rather than suggested..
>
> Looking closer, yes I agree with Christoph, sorry. It's called at the same
> level as ->prepare_write/commit_write so if there's any logic to this, it's
> logical that batched_write be an a_op too.
>
ok, the attached is the final version of the patch.
Please, take a look and make comments.
>
[-- Attachment #2: batched-write.patch --]
[-- Type: message/rfc822, Size: 12318 bytes --]
From:
Subject: No Subject
Date: Wed, 05 Jul 2006 20:45:18 +0400
Message-ID: <1152117918.6337.47.camel@tribesman.namesys.com>
From: Vladimir Saveliev <vs@namesys.com>
This patch adds a method batch_write to struct address_space_operations.
A filesystem may want to implement this operation to improve write performance.
Generic implementation for the method is made by cut-n-paste off generic_file_buffered_write:
it writes one page using prepare_write and commit_write address space operations.
Signed-off-by: Vladimir Saveliev <vs@namesys.com>
diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-mm5/include/linux/fs.h~batched-write 2006-07-05 13:23:14.000000000 +0400
+++ linux-2.6.17-mm5-vs/include/linux/fs.h 2006-07-05 13:31:46.000000000 +0400
@@ -246,6 +246,7 @@ struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
+struct pagevec;
extern void __init inode_init(unsigned long);
extern void __init inode_init_early(void);
@@ -347,6 +348,25 @@ struct page;
struct address_space;
struct writeback_control;
+/**
+ * struct write_descriptor - set of write arguments
+ * @pos: offset from the start of the file to write to
+ * @count: number of bytes to write
+ * @buf: pointer to data to be written
+ * @lru_pvec: multipage container to batch adding pages to LRU list
+ * @cached_page: allocated but not used on previous call
+ *
+ * This structure is to pass to batch_write file operation all
+ * information which is needed to continue write.
+ */
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ char __user *buf;
+ struct page *cached_page;
+ struct pagevec *lru_pvec;
+};
+
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
int (*readpage)(struct file *, struct page *);
@@ -367,6 +387,8 @@ struct address_space_operations {
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-mm5/mm/filemap.c~batched-write 2006-07-05 13:23:14.000000000 +0400
+++ linux-2.6.17-mm5-vs/mm/filemap.c 2006-07-05 13:47:58.000000000 +0400
@@ -2159,78 +2159,59 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+/**
+ * generic_batch_write - generic batch_write address space operation
+ * @file: the file to write to
+ * @desc: set of write arguments
+ * @written: returned number of bytes successfully written
+ *
+ * This implementation of batch_write address space operation writes not more
+ * than one page of file. It faults in user space, allocates page and calls
+ * prepare_write and commit_write address space operations. User data are
+ * copied by filemap_copy_from_user.
+ */
+static long generic_batch_write(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
-
- pagevec_init(&lru_pvec, 0);
-
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
-
- /* Limit the size of the copy to the caller's write size */
- bytes = min(bytes, count);
-
- /*
- * Limit the size of the copy to that of the current segment,
- * because fault_in_pages_readable() doesn't know how to walk
- * segments.
- */
- bytes = min(bytes, cur_iov->iov_len - iov_base);
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ long status;
+
+ /* offset within page write is to start at */
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
+
+ /* index of page we are to write to */
+ index = desc->pos >> PAGE_CACHE_SHIFT;
+
+ /* number of bytes which can be written to the page */
+ bytes = PAGE_CACHE_SIZE - offset;
+
+ /* limit the size of the copy to the caller's write size */
+ bytes = min(bytes, desc->count);
+ BUG_ON(bytes == 0);
+ while (1) {
/*
* Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
+ * Otherwise there's a nasty deadlock on copying from the same
+ * page as we're writing to, without it being marked
* up-to-date.
*/
- fault_in_pages_readable(buf, bytes);
-
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
+ fault_in_pages_readable(desc->buf, bytes);
- if (unlikely(bytes == 0)) {
- status = 0;
- copied = 0;
- goto zero_length_segment;
- }
+ page = __grab_cache_page(file->f_mapping, index,
+ &desc->cached_page, desc->lru_pvec);
+ if (!page)
+ return -ENOMEM;
- status = a_ops->prepare_write(file, page, offset, offset+bytes);
+ status = a_ops->prepare_write(file, page, offset,
+ offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);
if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2241,58 +2222,120 @@ generic_file_buffered_write(struct kiocb
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ /* copy user data to the page */
+ *written = filemap_copy_from_user(page, offset, desc->buf,
+ bytes);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
-zero_length_segment:
- if (likely(copied >= 0)) {
- if (!status)
- status = copied;
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+ break;
+ }
+ /*
+ * If commit_write returned error - write failed and we zero number of
+ * written bytes. If filemap_copy_from_user copied less than it was
+ * asked to we return -EFAULT and number of bytes actually written.
+ */
+ if (status)
+ *written = 0;
+ else if (*written != bytes)
+ status = -EFAULT;
+ return status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct pagevec lru_pvec;
+ struct write_descriptor desc;
+ size_t copied = 0;
+ const struct iovec *cur_iov = iov; /* current iovec */
+ size_t iov_base = 0; /* offset in the current iovec */
+ long (*batch_write)(struct file *file,
+ struct write_descriptor *desc,
+ size_t *written);
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
+ pagevec_init(&lru_pvec, 0);
+
+ /*
+ * initialize write descriptor fields: position to write to
+ * and number of bytes to write
+ */
+ desc.pos = pos;
+ desc.cached_page = NULL;
+ desc.lru_pvec = &lru_pvec;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1))
+ iov_base = written;
+ else
+ filemap_set_next_iovec(&cur_iov, &iov_base, written);
+
+ /*
+ * if file system implements batch_write method - use it, otherwise -
+ * use generic_batch_write
+ */
+ if (a_ops->batch_write)
+ batch_write = a_ops->batch_write;
+ else
+ batch_write = generic_batch_write;
+
+ do {
+ /* do not walk over current segment */
+ desc.buf = cur_iov->iov_base + iov_base;
+ desc.count = cur_iov->iov_len - iov_base;
+ if (desc.count > 0)
+ status = batch_write(file, &desc, &copied);
+ else {
+ copied = 0;
+ status = 0;
+ }
+ if (likely(copied >= 0)) {
+ written += copied;
+ count -= copied;
+ if (count) {
+ /*
+ * not everything is written yet. Adjust write
+ * descriptor for next iteration
+ */
+ desc.pos += copied;
+ if (likely(nr_segs == 1))
+ iov_base += copied;
+ else
filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
+ &iov_base,
+ copied);
}
}
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (status < 0)
- break;
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
- } while (count);
- *ppos = pos;
-
- if (cached_page)
- page_cache_release(cached_page);
+ if (status < 0)
+ break;
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (count);
+ *ppos = pos + written;
+
+ if (desc.cached_page)
+ page_cache_release(desc.cached_page);
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
diff -puN Documentation/filesystems/vfs.txt~batched-write Documentation/filesystems/vfs.txt
--- linux-2.6.17-mm5/Documentation/filesystems/vfs.txt~batched-write 2006-07-05 13:23:14.000000000 +0400
+++ linux-2.6.17-mm5-vs/Documentation/filesystems/vfs.txt 2006-07-05 19:36:15.000000000 +0400
@@ -534,6 +534,8 @@ struct address_space_operations {
struct list_head *pages, unsigned nr_pages);
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ long (*batch_write)(struct file *file, struct write_descriptor *desc,
+ size_t *written);
sector_t (*bmap)(struct address_space *, sector_t);
int (*invalidatepage) (struct page *, unsigned long);
int (*releasepage) (struct page *, int);
@@ -624,6 +626,17 @@ struct address_space_operations {
operations. It should avoid returning an error if possible -
errors should have been handled by prepare_write.
+ batch_write: optional
+ When calling the filesystem for writes, there is processing
+ that must be done:
+ 1) per word
+ 2) per page
+ 3) per call to the FS
+ If the FS is called per page, then it turns out that 3) costs more
+ than 1) and 2) for sophisticated filesystems. To allow the FS to
+ choose to pay the cost of 3) only once we call batch_write, if the
+ FS supports it.
+
bmap: called by the VFS to map a logical block offset within object to
physical block number. This method is used by the FIBMAP
ioctl and for working with swap-files. To be able to swap to
_
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-05 16:45 ` Vladimir V. Saveliev
@ 2006-07-05 19:26 ` Andrew Morton
2006-07-06 4:58 ` Hans Reiser
2006-07-09 20:11 ` Christoph Hellwig
1 sibling, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2006-07-05 19:26 UTC (permalink / raw)
To: Vladimir V. Saveliev; +Cc: reiser, hch, Linux-Kernel, reiserfs-dev
"Vladimir V. Saveliev" <vs@namesys.com> wrote:
>
> This patch adds a method batch_write to struct address_space_operations.
> A filesystem may want to implement this operation to improve write performance.
I failed to make a record of which other filesystems will want to use this.
Do you recall?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-06-29 21:11 ` Chase Venters
2006-06-29 21:21 ` Chase Venters
@ 2006-07-06 4:45 ` Hans Reiser
1 sibling, 0 replies; 20+ messages in thread
From: Hans Reiser @ 2006-07-06 4:45 UTC (permalink / raw)
To: Chase Venters; +Cc: Andrew Morton, LKML
Chase, we incorporated your suggestion, and I want to thank you for it.
Hans
Chase Venters wrote:
> On Thu, 29 Jun 2006, Hans Reiser wrote:
>
>>
>> (patch was attached)
>>
>
> Not quoted because patch isn't inlined, but you're testing for the
> presence of the batch_write pointer repeatedly in the loop. How about
> declare a batch_write ptr on the stack and then do your test once,
> outside of your do { } while (count) loop, and then set it to the
> generic method (before entering the loop) if the generic method isn't
> available?
>
> Thanks,
> Chase
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-05 19:26 ` Andrew Morton
@ 2006-07-06 4:58 ` Hans Reiser
2006-07-10 21:12 ` Andreas Dilger
0 siblings, 1 reply; 20+ messages in thread
From: Hans Reiser @ 2006-07-06 4:58 UTC (permalink / raw)
To: Andrew Morton
Cc: Vladimir V. Saveliev, hch, Linux-Kernel, reiserfs-dev,
Andreas Dilger
Reiser4, FUSE, maybe XFS, and GFS2. Andreas, your support had
conditions, so I will not characterize for you whether you said Lustre
or you would like it. Andreas, could you look at the patch and see what
you think?
I think with some confidence that this interface will acquire more users
than those over time. It really is the right natural interface for most
filesystems that try to carefully optimize write performance.
Andrew Morton wrote:
>
>
>I failed to make a record of which other filesystems will want to use this.
>Do you recall?
>
>
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-05 16:45 ` Vladimir V. Saveliev
2006-07-05 19:26 ` Andrew Morton
@ 2006-07-09 20:11 ` Christoph Hellwig
1 sibling, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2006-07-09 20:11 UTC (permalink / raw)
To: Vladimir V. Saveliev
Cc: Andrew Morton, Hans Reiser, Linux-Kernel, reiserfs-dev, mm,
shaggy, nathans
On Wed, Jul 05, 2006 at 08:45:35PM +0400, Vladimir V. Saveliev wrote:
> ok, the attached is the final version of the patch.
> Please, take a look and make comments.
Perfect, this is exactly how I hoped it will be done in the end. Very
nice massaging of the generic code without runtime impact on older filesystems
or code duplication. Thanks a lot Vladimir!
Now before we can put this into mainline we'd need some in-tree filesystems
to make use of it. I've cc'ed the usual suspects for this..
p.s. your mail setup seems a little odd. any chance you could just inline
the patch normally instead of using mime?
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 1/2] batch-write.patch
2006-07-06 4:58 ` Hans Reiser
@ 2006-07-10 21:12 ` Andreas Dilger
0 siblings, 0 replies; 20+ messages in thread
From: Andreas Dilger @ 2006-07-10 21:12 UTC (permalink / raw)
To: Hans Reiser
Cc: Andrew Morton, Vladimir V. Saveliev, hch, Linux-Kernel,
reiserfs-dev
On Jul 05, 2006 21:58 -0700, Hans Reiser wrote:
> Reiser4, FUSE, maybe XFS, and GFS2. Andreas, your support had
> conditions, so I will not characterize for you whether you said Lustre
> or you would like it. Andreas, could you look at the patch and see what
> you think?
Lustre definitely wants to be able to aggregate IOs into larger chunks
wherever possible. Having interfaces that allow this instead of going
through page-at-a-time interfaces is definitely a win. As it stands
now we have to avoid the VFS or otherwise have back-channel information
in order to ensure that large IOs remain large.
Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2006-07-10 21:12 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-06-29 19:17 [PATCH 1/2] batch-write.patch Hans Reiser
2006-06-29 21:11 ` Chase Venters
2006-06-29 21:21 ` Chase Venters
2006-07-06 4:45 ` Hans Reiser
2006-06-30 1:50 ` Andrew Morton
2006-07-04 11:12 ` Vladimir V. Saveliev
2006-07-04 11:21 ` Vladimir V. Saveliev
2006-07-04 12:07 ` Pekka Enberg
2006-07-04 18:43 ` Hans Reiser
2006-07-04 11:48 ` Christoph Hellwig
2006-07-04 17:44 ` Hans Reiser
2006-07-04 22:18 ` Andrew Morton
2006-07-04 22:25 ` Hans Reiser
2006-07-04 22:39 ` Hans Reiser
2006-07-05 16:45 ` Vladimir V. Saveliev
2006-07-05 19:26 ` Andrew Morton
2006-07-06 4:58 ` Hans Reiser
2006-07-10 21:12 ` Andreas Dilger
2006-07-09 20:11 ` Christoph Hellwig
2006-07-05 5:06 ` Alexander Zarochentsev
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox