From: "Vladimir V. Saveliev" <vs@namesys.com>
To: Andrew Morton <akpm@osdl.org>
Cc: Hans Reiser <reiser@namesys.com>,
dgc@sgi.com, hch@infradead.org, Reiserfs-Dev@namesys.com,
Linux-Kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org
Subject: Re: batched write
Date: Tue, 20 Jun 2006 20:26:44 +0400 [thread overview]
Message-ID: <1150820804.6447.45.camel@tribesman.namesys.com> (raw)
In-Reply-To: <20060620002659.08aee963.akpm@osdl.org>
[-- Attachment #1: Type: text/plain, Size: 386 bytes --]
Hello
On Tue, 2006-06-20 at 00:26 -0700, Andrew Morton wrote:
> On Tue, 20 Jun 2006 00:19:24 -0700
> Hans Reiser <reiser@namesys.com> wrote:
>
> > So far we have XFS, FUSE, and reiser4 benefiting from the potential
> > ability to process more than 4k at a time. Is it enough?
>
> Spose so. Let's see what the diff looks like?
>
ok, the first draft version for evaluation is here.
[-- Attachment #2: batched-write.patch --]
[-- Type: text/x-patch, Size: 8721 bytes --]
This patch adds a method fill_pages to struct address_space_operations.
A filesystem may want to implement this operation to improve write performance.
Generic implementation for the method is made by cut-n-paste off generic_file_buffered_write:
it writes one page using prepare_write and commit_write address space operations.
NOTE: just for evaluation, compiliable only yet.
diff -puN mm/filemap.c~batched-write mm/filemap.c
--- linux-2.6.17-rc6-mm2/mm/filemap.c~batched-write 2006-06-12 16:06:43.000000000 +0400
+++ linux-2.6.17-rc6-mm2-vs/mm/filemap.c 2006-06-20 19:52:53.000000000 +0400
@@ -2093,68 +2093,78 @@ generic_file_direct_write(struct kiocb *
}
EXPORT_SYMBOL(generic_file_direct_write);
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+typedef size_t (*write_actor_t)(struct page *, unsigned long, size_t,
+ const write_descriptor_t *);
+struct write_descriptor {
+ loff_t pos;
+ size_t count;
+ const struct iovec *cur_iov; /* current iovec */
+ size_t iov_base; /* offset in the current iovec */
+ char __user *buf;
+ write_actor_t actor;
+};
+
+static size_t write_actor(struct page *page, unsigned long offset, size_t bytes,
+ const write_descriptor_t *desc)
{
- struct file *file = iocb->ki_filp;
- struct address_space * mapping = file->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- struct page *page;
- struct page *cached_page = NULL;
- size_t bytes;
- struct pagevec lru_pvec;
- const struct iovec *cur_iov = iov; /* current iovec */
- size_t iov_base = 0; /* offset in the current iovec */
- char __user *buf;
+ return filemap_copy_from_user(page, offset, desc->buf, bytes);
+}
- pagevec_init(&lru_pvec, 0);
+static size_t write_iovec_actor(struct page *page, unsigned long offset,
+ size_t bytes, const write_descriptor_t *desc)
+{
+ return filemap_copy_from_user_iovec(page, offset, desc->cur_iov,
+ desc->iov_base, bytes);
+}
+
+/**
+ * generic_fill_pages
+ * @file:
+ * @desc:
+ * @lru_pvec:
+ * @cached_page:
+ * @copied:
+ *
+ * Number of bytes written is returned via @copied.
+ */
+long generic_fill_pages(struct file *file, const write_descriptor_t *desc,
+ struct pagevec *lru_pvec, struct page **cached_page,
+ size_t *copied)
+{
+ const struct address_space_operations *a_ops = file->f_mapping->a_ops;
+ struct page *page;
+ unsigned long index;
+ size_t bytes;
+ unsigned long offset;
+ unsigned long maxlen;
+ long status;
+
+ offset = (desc->pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
+ index = desc->pos >> PAGE_CACHE_SHIFT;
+ bytes = PAGE_CACHE_SIZE - offset;
+ if (bytes > desc->count)
+ bytes = desc->count;
/*
- * handle partial DIO write. Adjust cur_iov if needed.
+ * Bring in the user page that we will copy from _first_.
+ * Otherwise there's a nasty deadlock on copying from the same
+ * page as we're writing to, without it being marked
+ * up-to-date.
*/
- if (likely(nr_segs == 1))
- buf = iov->iov_base + written;
- else {
- filemap_set_next_iovec(&cur_iov, &iov_base, written);
- buf = cur_iov->iov_base + iov_base;
- }
-
- do {
- unsigned long index;
- unsigned long offset;
- unsigned long maxlen;
- size_t copied;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- */
- maxlen = cur_iov->iov_len - iov_base;
- if (maxlen > bytes)
- maxlen = bytes;
- fault_in_pages_readable(buf, maxlen);
-
- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
+ maxlen = desc->cur_iov->iov_len - desc->iov_base;
+ if (maxlen > bytes)
+ maxlen = bytes;
+
+ while (1) {
+ fault_in_pages_readable(desc->buf, maxlen);
+
+ page = __grab_cache_page(file->f_mapping, index, cached_page, lru_pvec);
+ if (!page)
+ return -ENOMEM;
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read(file->f_mapping->host);
if (status != AOP_TRUNCATED_PAGE)
unlock_page(page);
@@ -2162,51 +2172,87 @@ generic_file_buffered_write(struct kiocb
if (status == AOP_TRUNCATED_PAGE)
continue;
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
+ * prepare_write() may have instantiated a few
+ * blocks outside i_size. Trim these off
+ * again.
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ if (desc->pos + bytes > isize)
+ vmtruncate(file->f_mapping->host, isize);
+ return status;
}
- if (likely(nr_segs == 1))
- copied = filemap_copy_from_user(page, offset,
- buf, bytes);
- else
- copied = filemap_copy_from_user_iovec(page, offset,
- cur_iov, iov_base, bytes);
+
+ *copied = desc->actor(page, offset, bytes, desc);
+
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (status == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
continue;
}
- if (likely(copied > 0)) {
- if (!status)
- status = copied;
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
- filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
- if (count)
- buf = cur_iov->iov_base +
- iov_base;
- } else {
- iov_base += status;
- }
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
+ break;
+ }
+ if (status)
+ *copied = 0;
+ else if (*copied != bytes)
+ status = -EFAULT;
+ return status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ struct inode *inode = mapping->host;
+ long status;
+ struct page *cached_page = NULL;
+ struct pagevec lru_pvec;
+ write_descriptor_t desc;
+ size_t copied = 0;
+
+ pagevec_init(&lru_pvec, 0);
+
+ desc.pos = pos;
+ desc.count = count;
+ desc.cur_iov = iov;
+ desc.iov_base = 0;
+
+ /*
+ * handle partial DIO write. Adjust cur_iov if needed.
+ */
+ if (likely(nr_segs == 1)) {
+ desc.buf = iov->iov_base + written;
+ desc.actor = write_actor;
+ } else {
+ filemap_set_next_iovec(&desc.cur_iov, &desc.iov_base, written);
+ desc.buf = desc.cur_iov->iov_base + desc.iov_base;
+ desc.actor = write_iovec_actor;
+ }
+
+ do {
+ status = mapping->a_ops->fill_pages(file, &desc,
+ &lru_pvec, &cached_page, &copied);
+ if (likely(copied > 0)) {
+ written += copied;
+ desc.count -= copied;
+ desc.pos += copied;
+ desc.buf += copied;
+ if (unlikely(nr_segs > 1)) {
+ filemap_set_next_iovec(&desc.cur_iov,
+ &desc.iov_base, copied);
+ if (count)
+ desc.buf = desc.cur_iov->iov_base + desc.iov_base;
+ } else {
+ desc.iov_base += copied;
+ }
+ }
if (status < 0)
break;
balance_dirty_pages_ratelimited(mapping);
diff -puN include/linux/fs.h~batched-write include/linux/fs.h
--- linux-2.6.17-rc6-mm2/include/linux/fs.h~batched-write 2006-06-16 19:12:47.000000000 +0400
+++ linux-2.6.17-rc6-mm2-vs/include/linux/fs.h 2006-06-20 19:40:26.000000000 +0400
@@ -346,6 +346,9 @@ enum positive_aop_returns {
struct page;
struct address_space;
struct writeback_control;
+typedef struct write_descriptor write_descriptor_t;
+
+#include <linux/pagevec.h>
struct address_space_operations {
int (*writepage)(struct page *page, struct writeback_control *wbc);
@@ -367,6 +370,9 @@ struct address_space_operations {
*/
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ long (*fill_pages)(struct file *, const write_descriptor_t *,
+ struct pagevec *, struct page **, size_t *copied);
+
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
sector_t (*bmap)(struct address_space *, sector_t);
void (*invalidatepage) (struct page *, unsigned long);
_
next prev parent reply other threads:[~2006-06-20 16:41 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <44736D3E.8090808@namesys.com>
[not found] ` <20060524175312.GA3579@zero>
[not found] ` <44749E24.40203@namesys.com>
[not found] ` <20060608110044.GA5207@suse.de>
[not found] ` <1149766000.6336.29.camel@tribesman.namesys.com>
[not found] ` <20060608121006.GA8474@infradead.org>
2006-06-14 22:08 ` batched write Vladimir V. Saveliev
2006-06-17 17:04 ` Andrew Morton
2006-06-17 17:51 ` Hans Reiser
2006-06-18 11:20 ` Nix
2006-06-19 9:05 ` Hans Reiser
2006-06-19 11:32 ` Miklos Szeredi
2006-06-19 16:39 ` Hans Reiser
2006-06-19 17:35 ` Miklos Szeredi
2006-06-19 17:52 ` Akshat Aranya
2006-06-19 20:39 ` Hans Reiser
2006-06-19 16:27 ` Andreas Dilger
2006-06-19 16:51 ` Hans Reiser
2006-06-19 18:50 ` Andreas Dilger
2006-06-19 20:47 ` Hans Reiser
2006-06-20 0:01 ` David Chinner
2006-06-20 7:19 ` Hans Reiser
2006-06-20 7:26 ` Andrew Morton
2006-06-20 9:02 ` Steven Whitehouse
2006-06-20 16:26 ` Vladimir V. Saveliev [this message]
2006-06-20 17:29 ` Hans Reiser
2006-06-19 18:28 ` Vladimir V. Saveliev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1150820804.6447.45.camel@tribesman.namesys.com \
--to=vs@namesys.com \
--cc=Linux-Kernel@vger.kernel.org \
--cc=Reiserfs-Dev@namesys.com \
--cc=akpm@osdl.org \
--cc=dgc@sgi.com \
--cc=hch@infradead.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=reiser@namesys.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).