linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
@ 2005-10-04 19:42 Christoph Hellwig
  2005-10-04 22:05 ` Dave Kleikamp
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Christoph Hellwig @ 2005-10-04 19:42 UTC (permalink / raw)
  To: linux-fsdevel

This patch changes mpage_readpage/mpage_readpages to use a get_blocks
call that gets the disk mapping information for multiple blocks at the
same time, similar to the way direct I/O code works.  For extent based
filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
of the allocator calls, which is especially nice when that codepath
is rather heavyweight (as it is for example in xfs).  For filesystems
that don't have an allocator that make use of it nothing changes.

I've tested this heavily on XFS with block size = page size and 512 byte
blocks, and it passes the XFS QA regression test suite fine.  I've done
some basic fsx testing on all the other affected filesystems.


Index: linux-2.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.orig/fs/ext3/inode.c	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/fs/ext3/inode.c	2005-10-04 20:47:55.000000000 +0200
@@ -800,6 +800,18 @@
 	return ret;
 }
 
+static int
+ext3_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
+			struct buffer_head *bh_result, int create)
+{
+	int ret;
+
+	ret = ext3_get_block(inode, iblock, bh_result, create);
+	if (ret == 0)
+		bh_result->b_size = (1 << inode->i_blkbits);
+	return ret;
+}
+
 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
 
 static int
@@ -1411,14 +1423,15 @@
 
 static int ext3_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, ext3_get_block);
+	return mpage_readpage(page, ext3_get_blocks, ext3_get_block);
 }
 
 static int
 ext3_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
+	return mpage_readpages(mapping, pages, nr_pages,
+			ext3_get_blocks, ext3_get_block);
 }
 
 static int ext3_invalidatepage(struct page *page, unsigned long offset)
Index: linux-2.6/fs/mpage.c
===================================================================
--- linux-2.6.orig/fs/mpage.c	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/fs/mpage.c	2005-10-04 20:48:54.000000000 +0200
@@ -165,7 +165,9 @@
 
 static struct bio *
 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
-			sector_t *last_block_in_bio, get_block_t get_block)
+			sector_t *last_block_in_bio, struct buffer_head *map_bh,
+			unsigned long *first_logical_block, int *map_valid,
+			get_blocks_t get_blocks, get_block_t get_block)
 {
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
@@ -177,29 +179,63 @@
 	unsigned page_block;
 	unsigned first_hole = blocks_per_page;
 	struct block_device *bdev = NULL;
-	struct buffer_head bh;
 	int length;
 	int fully_mapped = 1;
+	unsigned nblocks, i;
 
 	if (page_has_buffers(page))
 		goto confused;
 
 	block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
 	last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
+	page_block = 0;
+
+	/*
+	 * Map blocks using the result from the last get_blocks call first.
+	 */
+	nblocks = map_bh->b_size >> inode->i_blkbits;
+	if (*map_valid &&
+	    block_in_file > *first_logical_block &&
+	    block_in_file < (*first_logical_block + nblocks)) {
+		unsigned map_offset = block_in_file - *first_logical_block;
+		unsigned last = nblocks - map_offset;
+
+		for (i = 0; ; i++) {
+			if (i == last) {
+				*map_valid = 0;
+				break;
+			} else if (page_block == blocks_per_page)
+				break;
+			blocks[page_block] = map_bh->b_blocknr + map_offset + i;
+			page_block++;
+			block_in_file++;
+		}
+		bdev = map_bh->b_bdev;
+	}
+
+	/*
+	 * Then do more get_blocks calls until we are done with this page.
+	 */
+	map_bh->b_page = page;
+	while (page_block < blocks_per_page) {
+		map_bh->b_state = 0;
+		map_bh->b_size = 0;
 
-	bh.b_page = page;
-	for (page_block = 0; page_block < blocks_per_page;
-				page_block++, block_in_file++) {
-		bh.b_state = 0;
 		if (block_in_file < last_block) {
-			if (get_block(inode, block_in_file, &bh, 0))
+			if (get_blocks(inode, block_in_file,
+				       last_block - block_in_file, map_bh, 0))
 				goto confused;
+			*first_logical_block = block_in_file;
+			*map_valid  = 1;
 		}
 
-		if (!buffer_mapped(&bh)) {
+		if (!buffer_mapped(map_bh)) {
 			fully_mapped = 0;
 			if (first_hole == blocks_per_page)
 				first_hole = page_block;
+			page_block++;
+			block_in_file++;
+			*map_valid = 0;
 			continue;
 		}
 
@@ -209,8 +245,8 @@
 		 * we just collected from get_block into the page's buffers
 		 * so readpage doesn't have to repeat the get_block call
 		 */
-		if (buffer_uptodate(&bh)) {
-			map_buffer_to_page(page, &bh, page_block);
+		if (buffer_uptodate(map_bh)) {
+			map_buffer_to_page(page, map_bh, page_block);
 			goto confused;
 		}
 	
@@ -218,10 +254,20 @@
 			goto confused;		/* hole -> non-hole */
 
 		/* Contiguous blocks? */
-		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
+		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
 			goto confused;
-		blocks[page_block] = bh.b_blocknr;
-		bdev = bh.b_bdev;
+		nblocks = map_bh->b_size >> inode->i_blkbits;
+		for (i = 0; ; i++) {
+			if (i == nblocks) {
+				*map_valid = 0;
+				break;
+			} else if (page_block == blocks_per_page)
+				break;
+			blocks[page_block] = map_bh->b_blocknr + i;
+			page_block++;
+			block_in_file++;
+		}
+		bdev = map_bh->b_bdev;
 	}
 
 	if (first_hole != blocks_per_page) {
@@ -260,7 +306,7 @@
 		goto alloc_new;
 	}
 
-	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
+	if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
 		bio = mpage_bio_submit(READ, bio);
 	else
 		*last_block_in_bio = blocks[blocks_per_page - 1];
@@ -325,12 +371,16 @@
  */
 int
 mpage_readpages(struct address_space *mapping, struct list_head *pages,
-				unsigned nr_pages, get_block_t get_block)
+				unsigned nr_pages, get_blocks_t get_blocks,
+				get_block_t get_block)
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	sector_t last_block_in_bio = 0;
 	struct pagevec lru_pvec;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+	int map_valid = 0;
 
 	pagevec_init(&lru_pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -342,7 +392,9 @@
 					page->index, GFP_KERNEL)) {
 			bio = do_mpage_readpage(bio, page,
 					nr_pages - page_idx,
-					&last_block_in_bio, get_block);
+					&last_block_in_bio, &map_bh,
+					&first_logical_block, &map_valid,
+					get_blocks, get_block);
 			if (!pagevec_add(&lru_pvec, page))
 				__pagevec_lru_add(&lru_pvec);
 		} else {
@@ -360,13 +412,18 @@
 /*
  * This isn't called much at all
  */
-int mpage_readpage(struct page *page, get_block_t get_block)
+int mpage_readpage(struct page *page, get_blocks_t get_blocks,
+		get_block_t get_block)
 {
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+	int map_valid = 0;
 
-	bio = do_mpage_readpage(bio, page, 1,
-			&last_block_in_bio, get_block);
+	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
+			&map_bh, &first_logical_block, &map_valid,
+			get_blocks, get_block);
 	if (bio)
 		mpage_bio_submit(READ, bio);
 	return 0;
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c	2005-10-04 19:52:49.000000000 +0200
@@ -1034,6 +1034,18 @@
 }
 
 STATIC int
+linvfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	unsigned long		max_blocks,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
+					create, 0, BMAPI_WRITE);
+}
+
+STATIC int
 linvfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
@@ -1139,7 +1151,7 @@
 	struct file		*unused,
 	struct page		*page)
 {
-	return mpage_readpage(page, linvfs_get_block);
+	return mpage_readpage(page, linvfs_get_blocks, linvfs_get_block);
 }
 
 STATIC int
@@ -1149,7 +1161,8 @@
 	struct list_head	*pages,
 	unsigned		nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_blocks,
+			linvfs_get_block);
 }
 
 STATIC void
Index: linux-2.6/include/linux/mpage.h
===================================================================
--- linux-2.6.orig/include/linux/mpage.h	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/include/linux/mpage.h	2005-10-04 19:52:49.000000000 +0200
@@ -14,8 +14,9 @@
 typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
 
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
-				unsigned nr_pages, get_block_t get_block);
-int mpage_readpage(struct page *page, get_block_t get_block);
+				unsigned nr_pages, get_blocks_t get_blocks,
+				get_block_t get_block);
+int mpage_readpage(struct page *page, get_blocks_t get_blocks, get_block_t get_block);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c	2005-10-04 20:47:05.000000000 +0200
+++ linux-2.6/fs/ext2/inode.c	2005-10-04 21:22:43.000000000 +0200
@@ -621,6 +621,18 @@
 	goto reread;
 }
 
+static int
+ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
+			struct buffer_head *bh_result, int create)
+{
+	int ret;
+
+	ret = ext2_get_block(inode, iblock, bh_result, create);
+	if (ret == 0)
+		bh_result->b_size = (1 << inode->i_blkbits);
+	return ret;
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
@@ -628,14 +640,15 @@
 
 static int ext2_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, ext2_get_block);
+	return mpage_readpage(page, ext2_get_blocks, ext2_get_block);
 }
 
 static int
 ext2_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
+	return mpage_readpages(mapping, pages, nr_pages,
+			ext2_get_blocks, ext2_get_block);
 }
 
 static int
@@ -663,18 +676,6 @@
 	return generic_block_bmap(mapping,block,ext2_get_block);
 }
 
-static int
-ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
-			struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ret = ext2_get_block(inode, iblock, bh_result, create);
-	if (ret == 0)
-		bh_result->b_size = (1 << inode->i_blkbits);
-	return ret;
-}
-
 static ssize_t
 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
Index: linux-2.6/fs/jfs/inode.c
===================================================================
--- linux-2.6.orig/fs/jfs/inode.c	2005-10-04 20:47:05.000000000 +0200
+++ linux-2.6/fs/jfs/inode.c	2005-10-04 20:47:31.000000000 +0200
@@ -273,13 +273,14 @@
 
 static int jfs_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, jfs_get_block);
+	return mpage_readpage(page, jfs_get_blocks, jfs_get_block);
 }
 
 static int jfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, jfs_get_blocks,
+			jfs_get_block);
 }
 
 static int jfs_prepare_write(struct file *file,
Index: linux-2.6/fs/reiserfs/inode.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/inode.c	2005-10-04 20:47:05.000000000 +0200
+++ linux-2.6/fs/reiserfs/inode.c	2005-10-04 20:47:31.000000000 +0200
@@ -1049,10 +1049,24 @@
 }
 
 static int
+reiserfs_get_blocks(struct inode *inode, sector_t iblock,
+		unsigned long max_blocks, struct buffer_head *bh_result,
+		int create)
+{
+	int ret;
+
+	ret = reiserfs_get_block(inode, iblock, bh_result, create);
+	if (ret == 0)
+		bh_result->b_size = (1 << inode->i_blkbits);
+	return ret;
+}
+
+static int
 reiserfs_readpages(struct file *file, struct address_space *mapping,
 		   struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_blocks,
+			reiserfs_get_block);
 }
 
 /* Compute real number of used bytes by file

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-04 19:42 [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s) Christoph Hellwig
@ 2005-10-04 22:05 ` Dave Kleikamp
  2005-10-04 23:15 ` Badari Pulavarty
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Dave Kleikamp @ 2005-10-04 22:05 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel

On Tue, 2005-10-04 at 21:42 +0200, Christoph Hellwig wrote:
> This patch changes mpage_readpage/mpage_readpages to use a get_blocks
> call that gets the disk mapping information for multiple blocks at the
> same time, similar to the way direct I/O code works.  For extent based
> filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
> of the allocator calls, which is especially nice when that codepath
> is rather heavyweight (as it is for example in xfs).  For filesystems
> that don't have an allocator that make use of it nothing changes.
> 
> I've tested this heavily on XFS with block size = page size and 512 byte
> blocks, and it passes the XFS QA regression test suite fine.  I've done
> some basic fsx testing on all the other affected filesystems.

This is looking good for jfs.  Feel free to add an ACK or a
signed-off-by from me.

Thanks,
Shaggy
-- 
David Kleikamp
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-04 19:42 [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s) Christoph Hellwig
  2005-10-04 22:05 ` Dave Kleikamp
@ 2005-10-04 23:15 ` Badari Pulavarty
  2005-10-05  7:14   ` Christoph Hellwig
  2005-10-05  6:04 ` Nikita Danilov
  2005-10-19 11:56 ` Christoph Hellwig
  3 siblings, 1 reply; 14+ messages in thread
From: Badari Pulavarty @ 2005-10-04 23:15 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel

On Tue, 2005-10-04 at 21:42 +0200, Christoph Hellwig wrote:
> This patch changes mpage_readpage/mpage_readpages to use a get_blocks
> call that gets the disk mapping information for multiple blocks at the
> same time, similar to the way direct I/O code works.  For extent based
> filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
> of the allocator calls, which is especially nice when that codepath
> is rather heavyweight (as it is for example in xfs).  For filesystems
> that don't have an allocator that make use of it nothing changes.
> 
> I've tested this heavily on XFS with block size = page size and 512 byte
> blocks, and it passes the XFS QA regression test suite fine.  I've done
> some basic fsx testing on all the other affected filesystems.
> 
> 
> Index: linux-2.6/fs/ext3/inode.c
> ===================================================================
> --- linux-2.6.orig/fs/ext3/inode.c	2005-10-04 19:51:00.000000000 +0200
> +++ linux-2.6/fs/ext3/inode.c	2005-10-04 20:47:55.000000000 +0200
> @@ -800,6 +800,18 @@
>  	return ret;
>  }
>  
> +static int
> +ext3_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
> +			struct buffer_head *bh_result, int create)
> +{
> +	int ret;
> +
> +	ret = ext3_get_block(inode, iblock, bh_result, create);
> +	if (ret == 0)
> +		bh_result->b_size = (1 << inode->i_blkbits);
> +	return ret;
> +}
> +
>  #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
>  
>  static int
> @@ -1411,14 +1423,15 @@
>  
>  static int ext3_readpage(struct file *file, struct page *page)
>  {
> -	return mpage_readpage(page, ext3_get_block);
> +	return mpage_readpage(page, ext3_get_blocks, ext3_get_block);
>  }
>  
>  static int
>  ext3_readpages(struct file *file, struct address_space *mapping,
>  		struct list_head *pages, unsigned nr_pages)
>  {
> -	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
> +	return mpage_readpages(mapping, pages, nr_pages,
> +			ext3_get_blocks, ext3_get_block);
>  }
>  
>  static int ext3_invalidatepage(struct page *page, unsigned long offset)
> Index: linux-2.6/fs/mpage.c
> ===================================================================
> --- linux-2.6.orig/fs/mpage.c	2005-10-04 19:51:00.000000000 +0200
> +++ linux-2.6/fs/mpage.c	2005-10-04 20:48:54.000000000 +0200
> @@ -165,7 +165,9 @@
>  
>  static struct bio *
>  do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
> -			sector_t *last_block_in_bio, get_block_t get_block)
> +			sector_t *last_block_in_bio, struct buffer_head *map_bh,
> +			unsigned long *first_logical_block, int *map_valid,
> +			get_blocks_t get_blocks, get_block_t get_block)
>  {
>  	struct inode *inode = page->mapping->host;
>  	const unsigned blkbits = inode->i_blkbits;
> @@ -177,29 +179,63 @@
>  	unsigned page_block;
>  	unsigned first_hole = blocks_per_page;
>  	struct block_device *bdev = NULL;
> -	struct buffer_head bh;
>  	int length;
>  	int fully_mapped = 1;
> +	unsigned nblocks, i;
>  
>  	if (page_has_buffers(page))
>  		goto confused;
>  
>  	block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
>  	last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
> +	page_block = 0;
> +
> +	/*
> +	 * Map blocks using the result from the last get_blocks call first.
> +	 */
> +	nblocks = map_bh->b_size >> inode->i_blkbits;
> +	if (*map_valid &&
> +	    block_in_file > *first_logical_block &&
> +	    block_in_file < (*first_logical_block + nblocks)) {
> +		unsigned map_offset = block_in_file - *first_logical_block;
> +		unsigned last = nblocks - map_offset;
> +
> +		for (i = 0; ; i++) {
> +			if (i == last) {
> +				*map_valid = 0;
> +				break;
> +			} else if (page_block == blocks_per_page)
> +				break;
> +			blocks[page_block] = map_bh->b_blocknr + map_offset + i;
> +			page_block++;
> +			block_in_file++;
> +		}
> +		bdev = map_bh->b_bdev;
> +	}
> +
> +	/*
> +	 * Then do more get_blocks calls until we are done with this page.
> +	 */
> +	map_bh->b_page = page;
> +	while (page_block < blocks_per_page) {
> +		map_bh->b_state = 0;
> +		map_bh->b_size = 0;
>  
> -	bh.b_page = page;
> -	for (page_block = 0; page_block < blocks_per_page;
> -				page_block++, block_in_file++) {
> -		bh.b_state = 0;
>  		if (block_in_file < last_block) {
> -			if (get_block(inode, block_in_file, &bh, 0))
> +			if (get_blocks(inode, block_in_file,
> +				       last_block - block_in_file, map_bh, 0))
>  				goto confused;
> +			*first_logical_block = block_in_file;
> +			*map_valid  = 1;
>  		}
>  
> -		if (!buffer_mapped(&bh)) {
> +		if (!buffer_mapped(map_bh)) {
>  			fully_mapped = 0;
>  			if (first_hole == blocks_per_page)
>  				first_hole = page_block;
> +			page_block++;
> +			block_in_file++;
> +			*map_valid = 0;
>  			continue;
>  		}
>  
> @@ -209,8 +245,8 @@
>  		 * we just collected from get_block into the page's buffers
>  		 * so readpage doesn't have to repeat the get_block call
>  		 */
> -		if (buffer_uptodate(&bh)) {
> -			map_buffer_to_page(page, &bh, page_block);
> +		if (buffer_uptodate(map_bh)) {
> +			map_buffer_to_page(page, map_bh, page_block);
>  			goto confused;
>  		}
>  	
> @@ -218,10 +254,20 @@
>  			goto confused;		/* hole -> non-hole */
>  
>  		/* Contiguous blocks? */
> -		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
> +		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
>  			goto confused;
> -		blocks[page_block] = bh.b_blocknr;
> -		bdev = bh.b_bdev;
> +		nblocks = map_bh->b_size >> inode->i_blkbits;
> +		for (i = 0; ; i++) {
> +			if (i == nblocks) {
> +				*map_valid = 0;
> +				break;
> +			} else if (page_block == blocks_per_page)
> +				break;
> +			blocks[page_block] = map_bh->b_blocknr + i;
> +			page_block++;
> +			block_in_file++;
> +		}
> +		bdev = map_bh->b_bdev;
>  	}
>  
>  	if (first_hole != blocks_per_page) {
> @@ -260,7 +306,7 @@
>  		goto alloc_new;
>  	}
>  
> -	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
> +	if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
>  		bio = mpage_bio_submit(READ, bio);
>  	else
>  		*last_block_in_bio = blocks[blocks_per_page - 1];
> @@ -325,12 +371,16 @@
>   */
>  int
>  mpage_readpages(struct address_space *mapping, struct list_head *pages,
> -				unsigned nr_pages, get_block_t get_block)
> +				unsigned nr_pages, get_blocks_t get_blocks,
> +				get_block_t get_block)
>  {
>  	struct bio *bio = NULL;
>  	unsigned page_idx;
>  	sector_t last_block_in_bio = 0;
>  	struct pagevec lru_pvec;
> +	struct buffer_head map_bh;
> +	unsigned long first_logical_block = 0;
> +	int map_valid = 0;
>  
>  	pagevec_init(&lru_pvec, 0);
>  	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
> @@ -342,7 +392,9 @@
>  					page->index, GFP_KERNEL)) {
>  			bio = do_mpage_readpage(bio, page,
>  					nr_pages - page_idx,
> -					&last_block_in_bio, get_block);
> +					&last_block_in_bio, &map_bh,
> +					&first_logical_block, &map_valid,
> +					get_blocks, get_block);
>  			if (!pagevec_add(&lru_pvec, page))
>  				__pagevec_lru_add(&lru_pvec);
>  		} else {
> @@ -360,13 +412,18 @@
>  /*
>   * This isn't called much at all
>   */
> -int mpage_readpage(struct page *page, get_block_t get_block)
> +int mpage_readpage(struct page *page, get_blocks_t get_blocks,
> +		get_block_t get_block)
>  {
>  	struct bio *bio = NULL;
>  	sector_t last_block_in_bio = 0;
> +	struct buffer_head map_bh;
> +	unsigned long first_logical_block = 0;
> +	int map_valid = 0;
>  
> -	bio = do_mpage_readpage(bio, page, 1,
> -			&last_block_in_bio, get_block);
> +	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
> +			&map_bh, &first_logical_block, &map_valid,
> +			get_blocks, get_block);
>  	if (bio)
>  		mpage_bio_submit(READ, bio);
>  	return 0;
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c	2005-10-04 19:51:00.000000000 +0200
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c	2005-10-04 19:52:49.000000000 +0200
> @@ -1034,6 +1034,18 @@
>  }
>  
>  STATIC int
> +linvfs_get_blocks(
> +	struct inode		*inode,
> +	sector_t		iblock,
> +	unsigned long		max_blocks,
> +	struct buffer_head	*bh_result,
> +	int			create)
> +{
> +	return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
> +					create, 0, BMAPI_WRITE);
> +}
> +
> +STATIC int
>  linvfs_get_blocks_direct(
>  	struct inode		*inode,
>  	sector_t		iblock,
> @@ -1139,7 +1151,7 @@
>  	struct file		*unused,
>  	struct page		*page)
>  {
> -	return mpage_readpage(page, linvfs_get_block);
> +	return mpage_readpage(page, linvfs_get_blocks, linvfs_get_block);
>  }
>  
>  STATIC int
> @@ -1149,7 +1161,8 @@
>  	struct list_head	*pages,
>  	unsigned		nr_pages)
>  {
> -	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
> +	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_blocks,
> +			linvfs_get_block);
>  }
>  
>  STATIC void
> Index: linux-2.6/include/linux/mpage.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mpage.h	2005-10-04 19:51:00.000000000 +0200
> +++ linux-2.6/include/linux/mpage.h	2005-10-04 19:52:49.000000000 +0200
> @@ -14,8 +14,9 @@
>  typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
>  
>  int mpage_readpages(struct address_space *mapping, struct list_head *pages,
> -				unsigned nr_pages, get_block_t get_block);
> -int mpage_readpage(struct page *page, get_block_t get_block);
> +				unsigned nr_pages, get_blocks_t get_blocks,
> +				get_block_t get_block);
> +int mpage_readpage(struct page *page, get_blocks_t get_blocks, get_block_t get_block);
>  int mpage_writepages(struct address_space *mapping,
>  		struct writeback_control *wbc, get_block_t get_block);
>  int mpage_writepage(struct page *page, get_block_t *get_block,

Christoph,

Do you really want mpage_readpage()/mpage_readpages() to take
get_blocks() and get_block() to take as arguments ? Or it just
a initial version and you have grand plans to clean it up ?
Me & Suparna did a version which needed only get_blocks() 
few months ago. I can try to dig it up, if you like.

In fact, we want to do the same thing for writepages() also - so
that the filesystem can do the block allocation in a single shot.
(as part of delayed allocation).

Thanks,
Badari



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-04 19:42 [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s) Christoph Hellwig
  2005-10-04 22:05 ` Dave Kleikamp
  2005-10-04 23:15 ` Badari Pulavarty
@ 2005-10-05  6:04 ` Nikita Danilov
  2005-10-05  7:10   ` Christoph Hellwig
  2005-10-19 11:56 ` Christoph Hellwig
  3 siblings, 1 reply; 14+ messages in thread
From: Nikita Danilov @ 2005-10-05  6:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel

Christoph Hellwig writes:

[...]

 >  					page->index, GFP_KERNEL)) {
 >  			bio = do_mpage_readpage(bio, page,
 >  					nr_pages - page_idx,
 > -					&last_block_in_bio, get_block);
 > +					&last_block_in_bio, &map_bh,
 > +					&first_logical_block, &map_valid,
 > +					get_blocks, get_block);

Oh, function taking 9 arguments, please... this is not windows.

By the way, I cannot see where get_block callback is used by mpage.c
code, why is it needed?

Nikita.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-05  6:04 ` Nikita Danilov
@ 2005-10-05  7:10   ` Christoph Hellwig
  2005-10-05 16:11     ` Suparna Bhattacharya
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2005-10-05  7:10 UTC (permalink / raw)
  To: Nikita Danilov; +Cc: linux-fsdevel

On Wed, Oct 05, 2005 at 10:04:48AM +0400, Nikita Danilov wrote:
>  >  					page->index, GFP_KERNEL)) {
>  >  			bio = do_mpage_readpage(bio, page,
>  >  					nr_pages - page_idx,
>  > -					&last_block_in_bio, get_block);
>  > +					&last_block_in_bio, &map_bh,
>  > +					&first_logical_block, &map_valid,
>  > +					get_blocks, get_block);
> 
> Oh, function taking 9 arguments, please... this is not windows.

I'm happy about suggestion to reduce the count.  One of the real
problems is that the buffer_head structure is extremly badly suited for
the get_blocks callback.  Having information like in the xfs_iomap_t
structure (which is XFS bmap information container) would simplify lots
of these things.


> By the way, I cannot see where get_block callback is used by mpage.c
> code, why is it needed?

It's only used by block_read_full_page which is called in the 'confused'
case.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-04 23:15 ` Badari Pulavarty
@ 2005-10-05  7:14   ` Christoph Hellwig
  2005-10-07 22:13     ` Badari Pulavarty
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2005-10-05  7:14 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: Christoph Hellwig, linux-fsdevel

On Tue, Oct 04, 2005 at 04:15:43PM -0700, Badari Pulavarty wrote:
> Do you really want mpage_readpage()/mpage_readpages() to take
> get_blocks() and get_block() to take as arguments ? Or it just
> a initial version and you have grand plans to clean it up ?
> Me & Suparna did a version which needed only get_blocks() 
> few months ago. I can try to dig it up, if you like.

I don't see how to get rid of it easily.  It's need for the confused
case that calls block_read_full_page, which needs a get_block()
callback.  Adding a failback_writepage callback would be a tad nicer
but not really solve the general issue.

> In fact, we want to do the same thing for writepages() also - so
> that the filesystem can do the block allocation in a single shot.
> (as part of delayed allocation).

I'm doing something like that internally for XFS right now.  Due to
thinks like delayed allocations, unwritten extents and probing ahead
unmapped blocks (for the mmap write case) we can't use the common
code currently - but I plan to get it closer to it so eventually we
could make the generic code handle all that.  It's just a long term
TODO list item, not a short term one.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-05  7:10   ` Christoph Hellwig
@ 2005-10-05 16:11     ` Suparna Bhattacharya
  0 siblings, 0 replies; 14+ messages in thread
From: Suparna Bhattacharya @ 2005-10-05 16:11 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Nikita Danilov, linux-fsdevel

On Wed, Oct 05, 2005 at 09:10:37AM +0200, Christoph Hellwig wrote:
> On Wed, Oct 05, 2005 at 10:04:48AM +0400, Nikita Danilov wrote:
> >  >  					page->index, GFP_KERNEL)) {
> >  >  			bio = do_mpage_readpage(bio, page,
> >  >  					nr_pages - page_idx,
> >  > -					&last_block_in_bio, get_block);
> >  > +					&last_block_in_bio, &map_bh,
> >  > +					&first_logical_block, &map_valid,
> >  > +					get_blocks, get_block);
> > 
> > Oh, function taking 9 arguments, please... this is not windows.
> 
> I'm happy about suggestion to reduce the count.  One of the real
> problems is that the buffer_head structure is extremly badly suited for
> the get_blocks callback.  Having information like in the xfs_iomap_t
> structure (which is XFS bmap information container) would simplify lots
> of these things.

In the writepages case, I ultimately ended up defining an mpageio
structure somewhat like the DIO code.

> 
> 
> > By the way, I cannot see where get_block callback is used by mpage.c
> > code, why is it needed?
> 
> It's only used by block_read_full_page which is called in the 'confused'
> case.

Having a readpage_fn callback may be a little bit better than this, but
yes this is when one really starts wanting to get rid of bufferheads ...

> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Regards
Suparna

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-05  7:14   ` Christoph Hellwig
@ 2005-10-07 22:13     ` Badari Pulavarty
  0 siblings, 0 replies; 14+ messages in thread
From: Badari Pulavarty @ 2005-10-07 22:13 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel

On Wed, 2005-10-05 at 09:14 +0200, Christoph Hellwig wrote:
> On Tue, Oct 04, 2005 at 04:15:43PM -0700, Badari Pulavarty wrote:
> > Do you really want mpage_readpage()/mpage_readpages() to take
> > get_blocks() and get_block() to take as arguments ? Or it just
> > a initial version and you have grand plans to clean it up ?
> > Me & Suparna did a version which needed only get_blocks() 
> > few months ago. I can try to dig it up, if you like.
> 
> I don't see how to get rid of it easily.  It's need for the confused
> case that calls block_read_full_page, which needs a get_block()
> callback.  Adding a failback_writepage callback would be a tad nicer
> but not really solve the general issue.

I looked at my patches. I changed block_read_full_page() to take
get_blocks() also, so I didn't need get_block(). I modified ONLY 
the filesystems I cared to test :( 

Well, doing this in general touches lots of filesystems & files +
breaks out of tree filesystems. So, for now I am happy with your
patch.

Thanks,
Badari


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-04 19:42 [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s) Christoph Hellwig
                   ` (2 preceding siblings ...)
  2005-10-05  6:04 ` Nikita Danilov
@ 2005-10-19 11:56 ` Christoph Hellwig
  2005-10-19 15:07   ` Dave Kleikamp
  2005-10-20 22:06   ` Andrew Morton
  3 siblings, 2 replies; 14+ messages in thread
From: Christoph Hellwig @ 2005-10-19 11:56 UTC (permalink / raw)
  To: akpm; +Cc: linux-fsdevel

On Tue, Oct 04, 2005 at 09:42:46PM +0200, Christoph Hellwig wrote:
> This patch changes mpage_readpage/mpage_readpages to use a get_blocks
> call that gets the disk mapping information for multiple blocks at the
> same time, similar to the way direct I/O code works.  For extent based
> filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
> of the allocator calls, which is especially nice when that codepath
> is rather heavyweight (as it is for example in xfs).  For filesystems
> that don't have an allocator that make use of it nothing changes.
> 
> I've tested this heavily on XFS with block size = page size and 512 byte
> blocks, and it passes the XFS QA regression test suite fine.  I've done
> some basic fsx testing on all the other affected filesystems.

I think the patch should go into -mm after no one had serious objections.


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: linux-2.6/fs/ext3/inode.c
===================================================================
--- linux-2.6.orig/fs/ext3/inode.c	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/fs/ext3/inode.c	2005-10-04 20:47:55.000000000 +0200
@@ -800,6 +800,18 @@
 	return ret;
 }
 
+static int
+ext3_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
+			struct buffer_head *bh_result, int create)
+{
+	int ret;
+
+	ret = ext3_get_block(inode, iblock, bh_result, create);
+	if (ret == 0)
+		bh_result->b_size = (1 << inode->i_blkbits);
+	return ret;
+}
+
 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
 
 static int
@@ -1411,14 +1423,15 @@
 
 static int ext3_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, ext3_get_block);
+	return mpage_readpage(page, ext3_get_blocks, ext3_get_block);
 }
 
 static int
 ext3_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
+	return mpage_readpages(mapping, pages, nr_pages,
+			ext3_get_blocks, ext3_get_block);
 }
 
 static int ext3_invalidatepage(struct page *page, unsigned long offset)
Index: linux-2.6/fs/mpage.c
===================================================================
--- linux-2.6.orig/fs/mpage.c	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/fs/mpage.c	2005-10-04 20:48:54.000000000 +0200
@@ -165,7 +165,9 @@
 
 static struct bio *
 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
-			sector_t *last_block_in_bio, get_block_t get_block)
+			sector_t *last_block_in_bio, struct buffer_head *map_bh,
+			unsigned long *first_logical_block, int *map_valid,
+			get_blocks_t get_blocks, get_block_t get_block)
 {
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
@@ -177,29 +179,63 @@
 	unsigned page_block;
 	unsigned first_hole = blocks_per_page;
 	struct block_device *bdev = NULL;
-	struct buffer_head bh;
 	int length;
 	int fully_mapped = 1;
+	unsigned nblocks, i;
 
 	if (page_has_buffers(page))
 		goto confused;
 
 	block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
 	last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;
+	page_block = 0;
+
+	/*
+	 * Map blocks using the result from the last get_blocks call first.
+	 */
+	nblocks = map_bh->b_size >> inode->i_blkbits;
+	if (*map_valid &&
+	    block_in_file > *first_logical_block &&
+	    block_in_file < (*first_logical_block + nblocks)) {
+		unsigned map_offset = block_in_file - *first_logical_block;
+		unsigned last = nblocks - map_offset;
+
+		for (i = 0; ; i++) {
+			if (i == last) {
+				*map_valid = 0;
+				break;
+			} else if (page_block == blocks_per_page)
+				break;
+			blocks[page_block] = map_bh->b_blocknr + map_offset + i;
+			page_block++;
+			block_in_file++;
+		}
+		bdev = map_bh->b_bdev;
+	}
+
+	/*
+	 * Then do more get_blocks calls until we are done with this page.
+	 */
+	map_bh->b_page = page;
+	while (page_block < blocks_per_page) {
+		map_bh->b_state = 0;
+		map_bh->b_size = 0;
 
-	bh.b_page = page;
-	for (page_block = 0; page_block < blocks_per_page;
-				page_block++, block_in_file++) {
-		bh.b_state = 0;
 		if (block_in_file < last_block) {
-			if (get_block(inode, block_in_file, &bh, 0))
+			if (get_blocks(inode, block_in_file,
+				       last_block - block_in_file, map_bh, 0))
 				goto confused;
+			*first_logical_block = block_in_file;
+			*map_valid  = 1;
 		}
 
-		if (!buffer_mapped(&bh)) {
+		if (!buffer_mapped(map_bh)) {
 			fully_mapped = 0;
 			if (first_hole == blocks_per_page)
 				first_hole = page_block;
+			page_block++;
+			block_in_file++;
+			*map_valid = 0;
 			continue;
 		}
 
@@ -209,8 +245,8 @@
 		 * we just collected from get_block into the page's buffers
 		 * so readpage doesn't have to repeat the get_block call
 		 */
-		if (buffer_uptodate(&bh)) {
-			map_buffer_to_page(page, &bh, page_block);
+		if (buffer_uptodate(map_bh)) {
+			map_buffer_to_page(page, map_bh, page_block);
 			goto confused;
 		}
 	
@@ -218,10 +254,20 @@
 			goto confused;		/* hole -> non-hole */
 
 		/* Contiguous blocks? */
-		if (page_block && blocks[page_block-1] != bh.b_blocknr-1)
+		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
 			goto confused;
-		blocks[page_block] = bh.b_blocknr;
-		bdev = bh.b_bdev;
+		nblocks = map_bh->b_size >> inode->i_blkbits;
+		for (i = 0; ; i++) {
+			if (i == nblocks) {
+				*map_valid = 0;
+				break;
+			} else if (page_block == blocks_per_page)
+				break;
+			blocks[page_block] = map_bh->b_blocknr + i;
+			page_block++;
+			block_in_file++;
+		}
+		bdev = map_bh->b_bdev;
 	}
 
 	if (first_hole != blocks_per_page) {
@@ -260,7 +306,7 @@
 		goto alloc_new;
 	}
 
-	if (buffer_boundary(&bh) || (first_hole != blocks_per_page))
+	if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
 		bio = mpage_bio_submit(READ, bio);
 	else
 		*last_block_in_bio = blocks[blocks_per_page - 1];
@@ -325,12 +371,16 @@
  */
 int
 mpage_readpages(struct address_space *mapping, struct list_head *pages,
-				unsigned nr_pages, get_block_t get_block)
+				unsigned nr_pages, get_blocks_t get_blocks,
+				get_block_t get_block)
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	sector_t last_block_in_bio = 0;
 	struct pagevec lru_pvec;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+	int map_valid = 0;
 
 	pagevec_init(&lru_pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -342,7 +392,9 @@
 					page->index, GFP_KERNEL)) {
 			bio = do_mpage_readpage(bio, page,
 					nr_pages - page_idx,
-					&last_block_in_bio, get_block);
+					&last_block_in_bio, &map_bh,
+					&first_logical_block, &map_valid,
+					get_blocks, get_block);
 			if (!pagevec_add(&lru_pvec, page))
 				__pagevec_lru_add(&lru_pvec);
 		} else {
@@ -360,13 +412,18 @@
 /*
  * This isn't called much at all
  */
-int mpage_readpage(struct page *page, get_block_t get_block)
+int mpage_readpage(struct page *page, get_blocks_t get_blocks,
+		get_block_t get_block)
 {
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
+	struct buffer_head map_bh;
+	unsigned long first_logical_block = 0;
+	int map_valid = 0;
 
-	bio = do_mpage_readpage(bio, page, 1,
-			&last_block_in_bio, get_block);
+	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
+			&map_bh, &first_logical_block, &map_valid,
+			get_blocks, get_block);
 	if (bio)
 		mpage_bio_submit(READ, bio);
 	return 0;
Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c	2005-10-04 19:52:49.000000000 +0200
@@ -1034,6 +1034,18 @@
 }
 
 STATIC int
+linvfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	unsigned long		max_blocks,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
+					create, 0, BMAPI_WRITE);
+}
+
+STATIC int
 linvfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
@@ -1139,7 +1151,7 @@
 	struct file		*unused,
 	struct page		*page)
 {
-	return mpage_readpage(page, linvfs_get_block);
+	return mpage_readpage(page, linvfs_get_blocks, linvfs_get_block);
 }
 
 STATIC int
@@ -1149,7 +1161,8 @@
 	struct list_head	*pages,
 	unsigned		nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_blocks,
+			linvfs_get_block);
 }
 
 STATIC void
Index: linux-2.6/include/linux/mpage.h
===================================================================
--- linux-2.6.orig/include/linux/mpage.h	2005-10-04 19:51:00.000000000 +0200
+++ linux-2.6/include/linux/mpage.h	2005-10-04 19:52:49.000000000 +0200
@@ -14,8 +14,9 @@
 typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
 
 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
-				unsigned nr_pages, get_block_t get_block);
-int mpage_readpage(struct page *page, get_block_t get_block);
+				unsigned nr_pages, get_blocks_t get_blocks,
+				get_block_t get_block);
+int mpage_readpage(struct page *page, get_blocks_t get_blocks, get_block_t get_block);
 int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
Index: linux-2.6/fs/ext2/inode.c
===================================================================
--- linux-2.6.orig/fs/ext2/inode.c	2005-10-04 20:47:05.000000000 +0200
+++ linux-2.6/fs/ext2/inode.c	2005-10-04 21:22:43.000000000 +0200
@@ -621,6 +621,18 @@
 	goto reread;
 }
 
+static int
+ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
+			struct buffer_head *bh_result, int create)
+{
+	int ret;
+
+	ret = ext2_get_block(inode, iblock, bh_result, create);
+	if (ret == 0)
+		bh_result->b_size = (1 << inode->i_blkbits);
+	return ret;
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
@@ -628,14 +640,15 @@
 
 static int ext2_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, ext2_get_block);
+	return mpage_readpage(page, ext2_get_blocks, ext2_get_block);
 }
 
 static int
 ext2_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
+	return mpage_readpages(mapping, pages, nr_pages,
+			ext2_get_blocks, ext2_get_block);
 }
 
 static int
@@ -663,18 +676,6 @@
 	return generic_block_bmap(mapping,block,ext2_get_block);
 }
 
-static int
-ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long max_blocks,
-			struct buffer_head *bh_result, int create)
-{
-	int ret;
-
-	ret = ext2_get_block(inode, iblock, bh_result, create);
-	if (ret == 0)
-		bh_result->b_size = (1 << inode->i_blkbits);
-	return ret;
-}
-
 static ssize_t
 ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
Index: linux-2.6/fs/jfs/inode.c
===================================================================
--- linux-2.6.orig/fs/jfs/inode.c	2005-10-04 20:47:05.000000000 +0200
+++ linux-2.6/fs/jfs/inode.c	2005-10-04 20:47:31.000000000 +0200
@@ -273,13 +273,14 @@
 
 static int jfs_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, jfs_get_block);
+	return mpage_readpage(page, jfs_get_blocks, jfs_get_block);
 }
 
 static int jfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, jfs_get_blocks,
+			jfs_get_block);
 }
 
 static int jfs_prepare_write(struct file *file,
Index: linux-2.6/fs/reiserfs/inode.c
===================================================================
--- linux-2.6.orig/fs/reiserfs/inode.c	2005-10-04 20:47:05.000000000 +0200
+++ linux-2.6/fs/reiserfs/inode.c	2005-10-04 20:47:31.000000000 +0200
@@ -1049,10 +1049,24 @@
 }
 
 static int
+reiserfs_get_blocks(struct inode *inode, sector_t iblock,
+		unsigned long max_blocks, struct buffer_head *bh_result,
+		int create)
+{
+	int ret;
+
+	ret = reiserfs_get_block(inode, iblock, bh_result, create);
+	if (ret == 0)
+		bh_result->b_size = (1 << inode->i_blkbits);
+	return ret;
+}
+
+static int
 reiserfs_readpages(struct file *file, struct address_space *mapping,
 		   struct list_head *pages, unsigned nr_pages)
 {
-	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_blocks,
+			reiserfs_get_block);
 }
 
 /* Compute real number of used bytes by file

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-19 11:56 ` Christoph Hellwig
@ 2005-10-19 15:07   ` Dave Kleikamp
  2005-10-20 22:06   ` Andrew Morton
  1 sibling, 0 replies; 14+ messages in thread
From: Dave Kleikamp @ 2005-10-19 15:07 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: akpm, linux-fsdevel

On Wed, 2005-10-19 at 13:56 +0200, Christoph Hellwig wrote:
> On Tue, Oct 04, 2005 at 09:42:46PM +0200, Christoph Hellwig wrote:
> > This patch changes mpage_readpage/mpage_readpages to use a get_blocks
> > call that gets the disk mapping information for multiple blocks at the
> > same time, similar to the way direct I/O code works.  For extent based
> > filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
> > of the allocator calls, which is especially nice when that codepath
> > is rather heavyweight (as it is for example in xfs).  For filesystems
> > that don't have an allocator that make use of it nothing changes.
> > 
> > I've tested this heavily on XFS with block size = page size and 512 byte
> > blocks, and it passes the XFS QA regression test suite fine.  I've done
> > some basic fsx testing on all the other affected filesystems.
> 
> I think the patch should go into -mm after no one had serious objections.

ACK.  Tested jfs with this patch on top of -rc4-mm1.

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

-- 
David Kleikamp
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-19 11:56 ` Christoph Hellwig
  2005-10-19 15:07   ` Dave Kleikamp
@ 2005-10-20 22:06   ` Andrew Morton
  2005-10-21  6:18     ` Suparna Bhattacharya
  2005-10-21 12:53     ` Dave Kleikamp
  1 sibling, 2 replies; 14+ messages in thread
From: Andrew Morton @ 2005-10-20 22:06 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel

Christoph Hellwig <hch@lst.de> wrote:
>
> On Tue, Oct 04, 2005 at 09:42:46PM +0200, Christoph Hellwig wrote:
>  > This patch changes mpage_readpage/mpage_readpages to use a get_blocks
>  > call that gets the disk mapping information for multiple blocks at the
>  > same time, similar to the way direct I/O code works.  For extent based
>  > filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
>  > of the allocator calls, which is especially nice when that codepath
>  > is rather heavyweight (as it is for example in xfs).  For filesystems
>  > that don't have an allocator that make use of it nothing changes.
>  > 
>  > I've tested this heavily on XFS with block size = page size and 512 byte
>  > blocks, and it passes the XFS QA regression test suite fine.  I've done
>  > some basic fsx testing on all the other affected filesystems.
> 
>  I think the patch should go into -mm after no one had serious objections.

It adds a bunch of complexity and overhead which will speed up a filesystem
which few people use while slowing down the filesystems which most people
use.

If/when we have an ext3 implementation of get_blocks() and some benchmarks
which show it's worthwhile then OK.

Could we please see the xfs benchmark results?

first_logical_block should have type sector_t.

Did you consider aggregating those nine args to do_mpage_readpage() into a
struct, passing the address of that?


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-20 22:06   ` Andrew Morton
@ 2005-10-21  6:18     ` Suparna Bhattacharya
  2005-10-21 12:53     ` Dave Kleikamp
  1 sibling, 0 replies; 14+ messages in thread
From: Suparna Bhattacharya @ 2005-10-21  6:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Christoph Hellwig, linux-fsdevel

On Thu, Oct 20, 2005 at 03:06:31PM -0700, Andrew Morton wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> >
> > On Tue, Oct 04, 2005 at 09:42:46PM +0200, Christoph Hellwig wrote:
> >  > This patch changes mpage_readpage/mpage_readpages to use a get_blocks
> >  > call that gets the disk mapping information for multiple blocks at the
> >  > same time, similar to the way direct I/O code works.  For extent based
> >  > filesystems like jfs, xfs or reiser4 this allows to reduce the overhead
> >  > of the allocator calls, which is especially nice when that codepath
> >  > is rather heavyweight (as it is for example in xfs).  For filesystems
> >  > that don't have an allocator that make use of it nothing changes.
> >  > 
> >  > I've tested this heavily on XFS with block size = page size and 512 byte
> >  > blocks, and it passes the XFS QA regression test suite fine.  I've done
> >  > some basic fsx testing on all the other affected filesystems.
> > 
> >  I think the patch should go into -mm after no one had serious objections.
> 
> It adds a bunch of complexity and overhead which will speed up a filesystem
> which few people use while slowing down the filesystems which most people
> use.
> 
> If/when we have an ext3 implementation of get_blocks() and some benchmarks
> which show it's worthwhile then OK.

We do already have this based on reservation based allocation (Mingming
posted the patches a while back). I think we need to combine the work we did
for mpage_writepages with the readpages work.

> 
> Could we please see the xfs benchmark results?
> 
> first_logical_block should have type sector_t.
> 
> Did you consider aggregating those nine args to do_mpage_readpage() into a
> struct, passing the address of that?

Regards
Suparna

> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Suparna Bhattacharya (suparna@in.ibm.com)
Linux Technology Center
IBM Software Lab, India


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-20 22:06   ` Andrew Morton
  2005-10-21  6:18     ` Suparna Bhattacharya
@ 2005-10-21 12:53     ` Dave Kleikamp
  2005-10-21 14:16       ` Jörn Engel
  1 sibling, 1 reply; 14+ messages in thread
From: Dave Kleikamp @ 2005-10-21 12:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Christoph Hellwig, linux-fsdevel

On Thu, 2005-10-20 at 15:06 -0700, Andrew Morton wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> >  I think the patch should go into -mm after no one had serious objections.
> 
> It adds a bunch of complexity and overhead which will speed up a filesystem
> which few people use while slowing down the filesystems which most people
> use.

It will speed up 3 filesystems which a significant number of people do
use.  I doubt the overhead to ext3 is even measurable, and I'd venture
to say that the majority using ext3 aren't using it because of its
speed.

> If/when we have an ext3 implementation of get_blocks() and some benchmarks
> which show it's worthwhile then OK.

Having Christoph's mpage_readpage patch in the kernel would definitely
be an incentive to someone implementing get_blocks in ext3.

> Could we please see the xfs benchmark results?
> 
> first_logical_block should have type sector_t.
> 
> Did you consider aggregating those nine args to do_mpage_readpage() into a
> struct, passing the address of that?

I was a little take aback with the long argument list too.  I would like
the patch to be accepted in one form or another.
-- 
David Kleikamp
IBM Linux Technology Center


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s)
  2005-10-21 12:53     ` Dave Kleikamp
@ 2005-10-21 14:16       ` Jörn Engel
  0 siblings, 0 replies; 14+ messages in thread
From: Jörn Engel @ 2005-10-21 14:16 UTC (permalink / raw)
  To: Dave Kleikamp; +Cc: Andrew Morton, Christoph Hellwig, linux-fsdevel

On Fri, 21 October 2005 07:53:12 -0500, Dave Kleikamp wrote:
> 
> It will speed up 3 filesystems which a significant number of people do
> use.  I doubt the overhead to ext3 is even measurable, and I'd venture
> to say that the majority using ext3 aren't using it because of its
> speed.

Yet they are very pleased with ext3 not being any slower.  Favoring
stability/simplicity/trust/etc. over speed is not equivalent to not
caring about speed at all.

That said, I agree with the other arguments.  In particular, I don't
mind ext3 being slightly slower for a while and getting noticeably
faster afterwards.

Jörn

-- 
ticks = jiffies;
while (ticks == jiffies);
ticks = jiffies;
-- /usr/src/linux/init/main.c
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2005-10-21 14:17 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-10-04 19:42 [PATCH, RFC] map multiple blocks at a time in mpage_readpage(s) Christoph Hellwig
2005-10-04 22:05 ` Dave Kleikamp
2005-10-04 23:15 ` Badari Pulavarty
2005-10-05  7:14   ` Christoph Hellwig
2005-10-07 22:13     ` Badari Pulavarty
2005-10-05  6:04 ` Nikita Danilov
2005-10-05  7:10   ` Christoph Hellwig
2005-10-05 16:11     ` Suparna Bhattacharya
2005-10-19 11:56 ` Christoph Hellwig
2005-10-19 15:07   ` Dave Kleikamp
2005-10-20 22:06   ` Andrew Morton
2005-10-21  6:18     ` Suparna Bhattacharya
2005-10-21 12:53     ` Dave Kleikamp
2005-10-21 14:16       ` Jörn Engel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).