From: Dmitry <dmonakhov@openvz.org>
To: Theodore Ts'o <tytso@mit.edu>, linux-ext4@vger.kernel.org
Cc: akpm@linux-foundation.org, Theodore Ts'o <tytso@mit.edu>
Subject: Re: [PATCH -v2 6/6] ext4: use bio layer instead of buffer layer in mpage_da_submit_io
Date: Mon, 25 Oct 2010 09:16:16 +0400 [thread overview]
Message-ID: <87tykavom7.fsf@dmon-lap.sw.ru> (raw)
In-Reply-To: <1287866420-23762-7-git-send-email-tytso@mit.edu>
On Sat, 23 Oct 2010 16:40:20 -0400, Theodore Ts'o <tytso@mit.edu> wrote:
> Call the block I/O layer directly instad of going through the buffer
> layer. This should give us much better performance and scalability,
> as well as lowering our CPU utilization when doing buffered writeback.
>
> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
> ---
> fs/ext4/Makefile | 2 +-
> fs/ext4/ext4.h | 36 +++++-
> fs/ext4/extents.c | 4 +-
> fs/ext4/inode.c | 118 ++-------------
> fs/ext4/page-io.c | 426 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/ext4/super.c | 8 +-
> 6 files changed, 485 insertions(+), 109 deletions(-)
> create mode 100644 fs/ext4/page-io.c
>
> diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
> index 8867b2a..c947e36 100644
> --- a/fs/ext4/Makefile
> +++ b/fs/ext4/Makefile
> @@ -4,7 +4,7 @@
>
> obj-$(CONFIG_EXT4_FS) += ext4.o
>
> -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
> +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
> ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
> ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 2283369..3d1abd0 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -168,7 +168,20 @@ struct mpage_da_data {
> int pages_written;
> int retval;
> };
> -#define EXT4_IO_UNWRITTEN 0x1
> +
> +/*
> + * Flags for ext4_io_end->flags
> + */
> +#define EXT4_IO_END_UNWRITTEN 0x0001
> +#define EXT4_IO_END_ERROR 0x0002
> +
> +struct ext4_io_page {
> + struct page *p_page;
> + int p_count;
> +};
> +
> +#define MAX_IO_PAGES 128
> +
> typedef struct ext4_io_end {
> struct list_head list; /* per-file finished IO list */
> struct inode *inode; /* file being written to */
> @@ -179,8 +192,18 @@ typedef struct ext4_io_end {
> struct work_struct work; /* data work queue */
> struct kiocb *iocb; /* iocb struct for AIO */
> int result; /* error value for AIO */
> + int num_io_pages;
> + struct ext4_io_page *pages[MAX_IO_PAGES];
> } ext4_io_end_t;
>
> +struct ext4_io_submit {
> + int io_op;
> + struct bio *io_bio;
> + ext4_io_end_t *io_end;
> + struct ext4_io_page *io_page;
> + sector_t io_next_block;
> +};
> +
> /*
> * Special inodes numbers
> */
> @@ -2044,6 +2067,17 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
> __u64 start_orig, __u64 start_donor,
> __u64 len, __u64 *moved_len);
>
> +/* page_io.c */
> +extern int __init init_ext4_pageio(void);
> +extern void exit_ext4_pageio(void);
> +extern void ext4_free_io_end(ext4_io_end_t *io);
> +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
> +extern int ext4_end_io_nolock(ext4_io_end_t *io);
> +extern void ext4_io_submit(struct ext4_io_submit *io);
> +extern int ext4_bio_write_page(struct ext4_io_submit *io,
> + struct page *page,
> + int len,
> + struct writeback_control *wbc);
>
> /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
> enum ext4_state_bits {
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index a0e6230..a1e20c8 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -3202,7 +3202,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
> * completed
> */
> if (io)
> - io->flag = EXT4_IO_UNWRITTEN;
> + io->flag = EXT4_IO_END_UNWRITTEN;
> else
> ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
> if (ext4_should_dioread_nolock(inode))
> @@ -3494,7 +3494,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
> */
> if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
> if (io)
> - io->flag = EXT4_IO_UNWRITTEN;
> + io->flag = EXT4_IO_END_UNWRITTEN;
> else
> ext4_set_inode_state(inode,
> EXT4_STATE_DIO_UNWRITTEN);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c65d647..58604fe 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2016,8 +2016,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
> struct buffer_head *bh, *page_bufs = NULL;
> int journal_data = ext4_should_journal_data(inode);
> sector_t pblock = 0, cur_logical = 0;
> + struct ext4_io_submit io_submit;
>
> BUG_ON(mpd->next_page <= mpd->first_page);
> + memset(&io_submit, 0, sizeof(io_submit));
> /*
> * We need to start from the first_page to the next_page - 1
> * to make sure we also write the mapped dirty buffer_heads.
> @@ -2109,16 +2111,16 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
> /* mark the buffer_heads as dirty & uptodate */
> block_commit_write(page, 0, len);
>
> - if (journal_data && PageChecked(page))
> + /*
> + * Delalloc doesn't support data journalling,
> + * but eventually maybe we'll lift this
> + * restriction.
> + */
> + if (unlikely(journal_data && PageChecked(page)))
> err = __ext4_journalled_writepage(page, len);
> - else if (buffer_uninit(page_bufs)) {
> - ext4_set_bh_endio(page_bufs, inode);
> - err = block_write_full_page_endio(page,
> - noalloc_get_block_write,
> - mpd->wbc, ext4_end_io_buffer_write);
> - } else
> - err = block_write_full_page(page,
> - noalloc_get_block_write, mpd->wbc);
> + else
> + err = ext4_bio_write_page(&io_submit, page,
> + len, mpd->wbc);
>
> if (!err)
> mpd->pages_written++;
> @@ -2131,6 +2133,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
> }
> pagevec_release(&pvec);
> }
> + ext4_io_submit(&io_submit);
> return ret;
> }
>
> @@ -3426,15 +3429,6 @@ ext4_readpages(struct file *file, struct address_space *mapping,
> return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
> }
>
> -static void ext4_free_io_end(ext4_io_end_t *io)
> -{
> - BUG_ON(!io);
> - if (io->page)
> - put_page(io->page);
> - iput(io->inode);
> - kfree(io);
> -}
> -
> static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
> {
> struct buffer_head *head, *bh;
> @@ -3640,68 +3634,6 @@ static void dump_completed_IO(struct inode * inode)
> }
>
> /*
> - * check a range of space and convert unwritten extents to written.
> - */
> -static int ext4_end_io_nolock(ext4_io_end_t *io)
> -{
> - struct inode *inode = io->inode;
> - loff_t offset = io->offset;
> - ssize_t size = io->size;
> - int ret = 0;
> -
> - ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
> - "list->prev 0x%p\n",
> - io, inode->i_ino, io->list.next, io->list.prev);
> -
> - if (list_empty(&io->list))
> - return ret;
> -
> - if (io->flag != EXT4_IO_UNWRITTEN)
> - return ret;
> -
> - ret = ext4_convert_unwritten_extents(inode, offset, size);
> - if (ret < 0) {
> - printk(KERN_EMERG "%s: failed to convert unwritten"
> - "extents to written extents, error is %d"
> - " io is still on inode %lu aio dio list\n",
> - __func__, ret, inode->i_ino);
> - return ret;
> - }
> -
> - if (io->iocb)
> - aio_complete(io->iocb, io->result, 0);
> - /* clear the DIO AIO unwritten flag */
> - io->flag = 0;
> - return ret;
> -}
> -
> -/*
> - * work on completed aio dio IO, to convert unwritten extents to extents
> - */
> -static void ext4_end_io_work(struct work_struct *work)
> -{
> - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> - struct inode *inode = io->inode;
> - struct ext4_inode_info *ei = EXT4_I(inode);
> - unsigned long flags;
> - int ret;
> -
> - mutex_lock(&inode->i_mutex);
> - ret = ext4_end_io_nolock(io);
> - if (ret < 0) {
> - mutex_unlock(&inode->i_mutex);
> - return;
> - }
> -
> - spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> - if (!list_empty(&io->list))
> - list_del_init(&io->list);
> - spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> - mutex_unlock(&inode->i_mutex);
> - ext4_free_io_end(io);
> -}
> -
> -/*
> * This function is called from ext4_sync_file().
> *
> * When IO is completed, the work to convert unwritten extents to
> @@ -3756,28 +3688,6 @@ int flush_completed_IO(struct inode *inode)
> return (ret2 < 0) ? ret2 : 0;
> }
>
> -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
> -{
> - ext4_io_end_t *io = NULL;
> -
> - io = kmalloc(sizeof(*io), flags);
> -
> - if (io) {
> - igrab(inode);
> - io->inode = inode;
> - io->flag = 0;
> - io->offset = 0;
> - io->size = 0;
> - io->page = NULL;
> - io->iocb = NULL;
> - io->result = 0;
> - INIT_WORK(&io->work, ext4_end_io_work);
> - INIT_LIST_HEAD(&io->list);
> - }
> -
> - return io;
> -}
> -
> static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> ssize_t size, void *private, int ret,
> bool is_async)
> @@ -3797,7 +3707,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
> size);
>
> /* if not aio dio with unwritten extents, just free io and return */
> - if (io_end->flag != EXT4_IO_UNWRITTEN){
> + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
> ext4_free_io_end(io_end);
> iocb->private = NULL;
> out:
> @@ -3842,7 +3752,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
> goto out;
> }
>
> - io_end->flag = EXT4_IO_UNWRITTEN;
> + io_end->flag = EXT4_IO_END_UNWRITTEN;
> inode = io_end->inode;
>
> /* Add the io_end to per-inode completed io list*/
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> new file mode 100644
> index 0000000..ec92e38
> --- /dev/null
> +++ b/fs/ext4/page-io.c
> @@ -0,0 +1,426 @@
> +/*
> + * linux/fs/ext4/page-io.c
> + *
> + * This contains the new page_io functions for ext4
> + *
> + * Written by Theodore Ts'o, 2010.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/fs.h>
> +#include <linux/time.h>
> +#include <linux/jbd2.h>
> +#include <linux/highuid.h>
> +#include <linux/pagemap.h>
> +#include <linux/quotaops.h>
> +#include <linux/string.h>
> +#include <linux/buffer_head.h>
> +#include <linux/writeback.h>
> +#include <linux/pagevec.h>
> +#include <linux/mpage.h>
> +#include <linux/namei.h>
> +#include <linux/uio.h>
> +#include <linux/bio.h>
> +#include <linux/workqueue.h>
> +#include <linux/kernel.h>
> +#include <linux/slab.h>
> +
> +#include "ext4_jbd2.h"
> +#include "xattr.h"
> +#include "acl.h"
> +#include "ext4_extents.h"
> +
> +static struct kmem_cache *io_page_cachep, *io_end_cachep;
> +
> +int __init init_ext4_pageio(void)
> +{
> + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
> + if (io_page_cachep == NULL)
> + return -ENOMEM;
> + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
> + if (io_page_cachep == NULL) {
> + kmem_cache_destroy(io_page_cachep);
> + return -ENOMEM;
> + }
> +
> + return 0;
> +}
> +
> +void exit_ext4_pageio(void)
> +{
> + kmem_cache_destroy(io_end_cachep);
> + kmem_cache_destroy(io_page_cachep);
> +}
> +
> +void ext4_free_io_end(ext4_io_end_t *io)
> +{
> + int i;
> +
> + BUG_ON(!io);
> + if (io->page)
> + put_page(io->page);
> + for (i = 0; i < io->num_io_pages; i++) {
> + if (--io->pages[i]->p_count == 0) {
> + struct page *page = io->pages[i]->p_page;
> +
> + end_page_writeback(page);
> + put_page(page);
> + kmem_cache_free(io_page_cachep, io->pages[i]);
> + }
> + }
> + io->num_io_pages = 0;
> + iput(io->inode);
> + kmem_cache_free(io_end_cachep, io);
> +}
> +
> +/*
> + * check a range of space and convert unwritten extents to written.
> + */
> +int ext4_end_io_nolock(ext4_io_end_t *io)
> +{
> + struct inode *inode = io->inode;
> + loff_t offset = io->offset;
> + ssize_t size = io->size;
> + int ret = 0;
> +
> + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
> + "list->prev 0x%p\n",
> + io, inode->i_ino, io->list.next, io->list.prev);
> +
> + if (list_empty(&io->list))
> + return ret;
> +
> + if (!(io->flag & EXT4_IO_END_UNWRITTEN))
> + return ret;
> +
> + ret = ext4_convert_unwritten_extents(inode, offset, size);
> + if (ret < 0) {
> + printk(KERN_EMERG "%s: failed to convert unwritten "
> + "extents to written extents, error is %d "
> + "io is still on inode %lu aio dio list\n",
> + __func__, ret, inode->i_ino);
> + return ret;
> + }
> +
> + if (io->iocb)
> + aio_complete(io->iocb, io->result, 0);
> + /* clear the DIO AIO unwritten flag */
> + io->flag &= ~EXT4_IO_END_UNWRITTEN;
> + return ret;
> +}
> +
> +/*
> + * work on completed aio dio IO, to convert unwritten extents to extents
> + */
> +static void ext4_end_io_work(struct work_struct *work)
> +{
> + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
> + struct inode *inode = io->inode;
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + unsigned long flags;
> + int ret;
> +
> + mutex_lock(&inode->i_mutex);
> + ret = ext4_end_io_nolock(io);
> + if (ret < 0) {
> + mutex_unlock(&inode->i_mutex);
> + return;
> + }
> +
> + spin_lock_irqsave(&ei->i_completed_io_lock, flags);
> + if (!list_empty(&io->list))
> + list_del_init(&io->list);
> + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
> + mutex_unlock(&inode->i_mutex);
> + ext4_free_io_end(io);
> +}
> +
> +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
> +{
> + ext4_io_end_t *io = NULL;
> +
> + io = kmem_cache_alloc(io_end_cachep, flags);
> + if (io) {
> + memset(io, 0, sizeof(*io));
> + io->inode = igrab(inode);
> + BUG_ON(!io->inode);
> + INIT_WORK(&io->work, ext4_end_io_work);
> + INIT_LIST_HEAD(&io->list);
> + }
> + return io;
> +}
> +
> +/*
> + * Print an buffer I/O error compatible with the fs/buffer.c. This
> + * provides compatibility with dmesg scrapers that look for a specific
> + * buffer I/O error message. We really need a unified error reporting
> + * structure to userspace ala Digital Unix's uerf system, but it's
> + * probably not going to happen in my lifetime, due to LKML politics...
> + */
> +static void buffer_io_error(struct buffer_head *bh)
> +{
> + char b[BDEVNAME_SIZE];
> + printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
> + bdevname(bh->b_bdev, b),
> + (unsigned long long)bh->b_blocknr);
> +}
> +
> +static void ext4_end_bio(struct bio *bio, int error)
> +{
> + ext4_io_end_t *io_end = bio->bi_private;
> + struct workqueue_struct *wq;
> + struct inode *inode;
> + unsigned long flags;
> + int i;
> +
> + BUG_ON(!io_end);
> + inode = io_end->inode;
> + bio->bi_private = NULL;
> + bio->bi_end_io = NULL;
> + if (test_bit(BIO_UPTODATE, &bio->bi_flags))
> + error = 0;
> + bio_put(bio);
> +
> + if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
> + pr_err("sb umounted, discard end_io request for inode %lu\n",
> + io_end->inode->i_ino);
> + ext4_free_io_end(io_end);
> + return;
> + }
> +
> + if (error) {
> + io_end->flag |= EXT4_IO_END_ERROR;
> + ext4_warning(inode->i_sb, "I/O error writing inode %lu "
> + "(offset %llu size %ld)", inode->i_ino,
> + (unsigned long long) io_end->offset,
> + (long) io_end->size);
> + }
> +
> + for (i = 0; i < io_end->num_io_pages; i++) {
> + struct page *page = io_end->pages[i]->p_page;
> + struct buffer_head *bh, *head;
> + int partial_write = 0;
> +
> + head = page_buffers(page);
> + if (error)
> + SetPageError(page);
> + BUG_ON(!head);
> + if (head->b_size == PAGE_CACHE_SIZE)
> + clear_buffer_dirty(head);
> + else {
> + loff_t offset;
> + loff_t io_end_offset = io_end->offset + io_end->size;
> +
> + offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
> + bh = head;
> + do {
> + if ((offset >= io_end->offset) &&
> + (offset+bh->b_size <= io_end_offset)) {
> + if (error)
> + buffer_io_error(bh);
> +
> + clear_buffer_dirty(bh);
> + }
> + if (buffer_delay(bh))
> + partial_write = 1;
> + else if (!buffer_mapped(bh))
> + clear_buffer_dirty(bh);
> + else if (buffer_dirty(bh))
> + partial_write = 1;
> + offset += bh->b_size;
> + bh = bh->b_this_page;
> + } while (bh != head);
> + }
> +
> + if (--io_end->pages[i]->p_count == 0) {
> + struct page *page = io_end->pages[i]->p_page;
> +
> + end_page_writeback(page);
> + put_page(page);
> + kmem_cache_free(io_page_cachep, io_end->pages[i]);
> + }
> +
> + /*
> + * If this is a partial write which happened to make
> + * all buffers uptodate then we can optimize away a
> + * bogus readpage() for the next read(). Here we
> + * 'discover' whether the page went uptodate as a
> + * result of this (potentially partial) write.
> + */
> + if (!partial_write)
> + SetPageUptodate(page);
> + }
> +
> + io_end->num_io_pages = 0;
> +
> + /* Add the io_end to per-inode completed io list*/
> + spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
> + list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
> + spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
> +
> + wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
> + /* queue the work to convert unwritten extents to written */
> + queue_work(wq, &io_end->work);
> +}
> +
> +void ext4_io_submit(struct ext4_io_submit *io)
> +{
> + struct bio *bio = io->io_bio;
> +
> + if (bio) {
> + bio_get(io->io_bio);
> + submit_bio(io->io_op, io->io_bio);
> + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
Definitely this BUG_ON should be converted to ext4_error or something
similar, otherwise a writeback attempt to a removed USB stick will be fatal
for the whole system. IMHO it is reasonable to skip this check altogether,
because all the work will be done in ext4_end_bio() anyway.
> + bio_put(io->io_bio);
> + }
> + io->io_bio = 0;
> + io->io_op = 0;
> + io->io_end = 0;
> +}
> +
> +static int io_submit_init(struct ext4_io_submit *io,
> + struct inode *inode,
> + struct writeback_control *wbc,
> + struct buffer_head *bh)
> +{
> + ext4_io_end_t *io_end;
> + struct page *page = bh->b_page;
> + int nvecs = bio_get_nr_vecs(bh->b_bdev);
> + struct bio *bio;
> +
> + io_end = ext4_init_io_end(inode, GFP_NOFS);
> + if (!io_end)
> + return -ENOMEM;
> + do {
> + bio = bio_alloc(GFP_NOIO, nvecs);
> + nvecs >>= 1;
> + } while (bio == NULL);
> +
> + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
> + bio->bi_bdev = bh->b_bdev;
> + bio->bi_private = io->io_end = io_end;
> + bio->bi_end_io = ext4_end_bio;
> +
> + io_end->inode = inode;
> + io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
> +
> + io->io_bio = bio;
> + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
> + WRITE_SYNC_PLUG : WRITE);
> + io->io_next_block = bh->b_blocknr;
> + return 0;
> +}
> +
> +static int io_submit_add_bh(struct ext4_io_submit *io,
> + struct ext4_io_page *io_page,
> + struct inode *inode,
> + struct writeback_control *wbc,
> + struct buffer_head *bh)
> +{
> + ext4_io_end_t *io_end;
> + int ret;
> +
> + if (buffer_new(bh)) {
> + clear_buffer_new(bh);
> + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
> + }
> +
> + if (!buffer_mapped(bh) || buffer_delay(bh)) {
> + if (!buffer_mapped(bh))
> + clear_buffer_dirty(bh);
> + if (io->io_bio)
> + ext4_io_submit(io);
> + return 0;
> + }
> +
> + if (io->io_bio && bh->b_blocknr != io->io_next_block) {
> +submit_and_retry:
> + ext4_io_submit(io);
> + }
> + if (io->io_bio == NULL) {
> + ret = io_submit_init(io, inode, wbc, bh);
> + if (ret)
> + return ret;
> + }
> + io_end = io->io_end;
> + if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
> + (io_end->pages[io_end->num_io_pages-1] != io_page))
> + goto submit_and_retry;
> + if (buffer_uninit(bh))
> + io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
> + io->io_end->size += bh->b_size;
> + io->io_next_block++;
> + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
> + if (ret != bh->b_size)
> + goto submit_and_retry;
> + if ((io_end->num_io_pages == 0) ||
> + (io_end->pages[io_end->num_io_pages-1] != io_page)) {
> + io_end->pages[io_end->num_io_pages++] = io_page;
> + io_page->p_count++;
> + }
> + return 0;
> +}
> +
> +int ext4_bio_write_page(struct ext4_io_submit *io,
> + struct page *page,
> + int len,
> + struct writeback_control *wbc)
> +{
> + struct inode *inode = page->mapping->host;
> + unsigned block_start, block_end, blocksize;
> + struct ext4_io_page *io_page;
> + struct buffer_head *bh, *head;
> + int ret = 0;
> +
> + blocksize = 1 << inode->i_blkbits;
> +
> + BUG_ON(PageWriteback(page));
> + set_page_writeback(page);
> + ClearPageError(page);
> +
> + io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
> + if (!io_page) {
> + set_page_dirty(page);
> + unlock_page(page);
> + return -ENOMEM;
> + }
> + io_page->p_page = page;
> + io_page->p_count = 0;
> + get_page(page);
> +
> + for (bh = head = page_buffers(page), block_start = 0;
> + bh != head || !block_start;
> + block_start = block_end, bh = bh->b_this_page) {
> + block_end = block_start + blocksize;
> + if (block_start >= len) {
> + clear_buffer_dirty(bh);
> + set_buffer_uptodate(bh);
> + continue;
> + }
> + ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
> + if (ret) {
> + /*
> + * We only get here on ENOMEM. Not much else
> + * we can do but mark the page as dirty, and
> + * better luck next time.
> + */
> + set_page_dirty(page);
> + break;
> + }
> + }
> + unlock_page(page);
> + /*
> + * If the page was truncated before we could do the writeback,
> + * or we had a memory allocation error while trying to write
> + * the first buffer head, we won't have submitted any pages for
> + * I/O. In that case we need to make sure we've cleared the
> + * PageWriteback bit from the page to prevent the system from
> + * wedging later on.
> + */
> + if (io_page->p_count == 0) {
> + put_page(page);
> + end_page_writeback(page);
> + kmem_cache_free(io_page_cachep, io_page);
> + }
> + return ret;
> +}
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 16002ec..9f602c2 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -4768,9 +4768,12 @@ static int __init init_ext4_fs(void)
> int err;
>
> ext4_check_flag_values();
> - err = init_ext4_system_zone();
> + err = init_ext4_pageio();
> if (err)
> return err;
> + err = init_ext4_system_zone();
> + if (err)
> + goto out5;
> ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
> if (!ext4_kset)
> goto out4;
> @@ -4811,6 +4814,8 @@ out3:
> kset_unregister(ext4_kset);
> out4:
> exit_ext4_system_zone();
> +out5:
> + exit_ext4_pageio();
> return err;
> }
>
> @@ -4826,6 +4831,7 @@ static void __exit exit_ext4_fs(void)
> remove_proc_entry("fs/ext4", NULL);
> kset_unregister(ext4_kset);
> exit_ext4_system_zone();
> + exit_ext4_pageio();
> }
>
> MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
> --
> 1.7.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2010-10-25 5:16 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-10-23 20:40 [PATCH -v2 0/6] ext4: use the bio layer directly Theodore Ts'o
2010-10-23 20:40 ` [PATCH -v2 1/6] ext4: call mpage_da_submit_io() from mpage_da_map_blocks() Theodore Ts'o
2010-10-23 20:40 ` [PATCH -v2 2/6] ext4: simplify ext4_writepage() Theodore Ts'o
2010-10-23 20:40 ` [PATCH -v2 3/6] ext4: inline ext4_writepage() into mpage_da_submit_io() Theodore Ts'o
2010-10-23 20:40 ` [PATCH -v2 4/6] ext4: inline walk_page_buffers() into mpage_da_submit_io Theodore Ts'o
2010-10-23 20:40 ` [PATCH -v2 5/6] ext4: move mpage_put_bnr_to_bhs()'s functionality to mpage_da_submit_io() Theodore Ts'o
2010-10-23 20:40 ` [PATCH -v2 6/6] ext4: use bio layer instead of buffer layer in mpage_da_submit_io Theodore Ts'o
2010-10-25 5:16 ` Dmitry [this message]
2010-10-25 12:33 ` Ted Ts'o
2010-10-25 13:05 ` Dmitry
2010-10-23 23:03 ` [PATCH -v2 0/6] ext4: use the bio layer directly Ted Ts'o
2010-10-30 19:10 ` Eric Whitney
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87tykavom7.fsf@dmon-lap.sw.ru \
--to=dmonakhov@openvz.org \
--cc=akpm@linux-foundation.org \
--cc=linux-ext4@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.