cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
* [Cluster-devel] [PATCH] GFS2: rewrite fallocate code to write blocks directly
@ 2011-09-12 23:15 Benjamin Marzinski
  2011-09-18 13:45 ` Steven Whitehouse
  0 siblings, 1 reply; 2+ messages in thread
From: Benjamin Marzinski @ 2011-09-12 23:15 UTC (permalink / raw)
  To: cluster-devel.redhat.com

GFS2's fallocate code currently goes through the page cache. Since it's only
writing to the end of the file or to holes in it, it doesn't need to, and it
was causing issues on low memory environments. This patch pulls in some of
Steve's block allocation work, and uses it to simply allocate the blocks for
the file, and zero them out at allocation time.  It provides a slight
performance increase, and it dramatically simplifies the code.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
 fs/gfs2/bmap.c   |   12 +++
 fs/gfs2/file.c   |  171 +++++++------------------------------------------------
 fs/gfs2/incore.h |    3 
 3 files changed, 39 insertions(+), 147 deletions(-)

Index: gfs2-2.6-nmw/fs/gfs2/file.c
===================================================================
--- gfs2-2.6-nmw.orig/fs/gfs2/file.c
+++ gfs2-2.6-nmw/fs/gfs2/file.c
@@ -669,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struc
 	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
-static int empty_write_end(struct page *page, unsigned from,
-			   unsigned to, int mode)
-{
-	struct inode *inode = page->mapping->host;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct buffer_head *bh;
-	unsigned offset, blksize = 1 << inode->i_blkbits;
-	pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
-
-	zero_user(page, from, to-from);
-	mark_page_accessed(page);
-
-	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
-		if (!gfs2_is_writeback(ip))
-			gfs2_page_add_databufs(ip, page, from, to);
-
-		block_commit_write(page, from, to);
-		return 0;
-	}
-
-	offset = 0;
-	bh = page_buffers(page);
-	while (offset < to) {
-		if (offset >= from) {
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-			clear_buffer_new(bh);
-			write_dirty_buffer(bh, WRITE);
-		}
-		offset += blksize;
-		bh = bh->b_this_page;
-	}
-
-	offset = 0;
-	bh = page_buffers(page);
-	while (offset < to) {
-		if (offset >= from) {
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				return -EIO;
-		}
-		offset += blksize;
-		bh = bh->b_this_page;
-	}
-	return 0;
-}
-
-static int needs_empty_write(sector_t block, struct inode *inode)
-{
-	int error;
-	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
-
-	bh_map.b_size = 1 << inode->i_blkbits;
-	error = gfs2_block_map(inode, block, &bh_map, 0);
-	if (unlikely(error))
-		return error;
-	return !buffer_mapped(&bh_map);
-}
-
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
-			      int mode)
-{
-	struct inode *inode = page->mapping->host;
-	unsigned start, end, next, blksize;
-	sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	int ret;
-
-	blksize = 1 << inode->i_blkbits;
-	next = end = 0;
-	while (next < from) {
-		next += blksize;
-		block++;
-	}
-	start = next;
-	do {
-		next += blksize;
-		ret = needs_empty_write(block, inode);
-		if (unlikely(ret < 0))
-			return ret;
-		if (ret == 0) {
-			if (end) {
-				ret = __block_write_begin(page, start, end - start,
-							  gfs2_block_map);
-				if (unlikely(ret))
-					return ret;
-				ret = empty_write_end(page, start, end, mode);
-				if (unlikely(ret))
-					return ret;
-				end = 0;
-			}
-			start = next;
-		}
-		else
-			end = next;
-		block++;
-	} while (next < to);
-
-	if (end) {
-		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
-		if (unlikely(ret))
-			return ret;
-		ret = empty_write_end(page, start, end, mode);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	return 0;
-}
-
 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
 			   int mode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct buffer_head *dibh;
 	int error;
-	u64 start = offset >> PAGE_CACHE_SHIFT;
-	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
-	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
-	pgoff_t curr;
-	struct page *page;
-	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
-	unsigned int from, to;
-
-	if (!end_offset)
-		end_offset = PAGE_CACHE_SIZE;
+	unsigned int nr_blks;
+	sector_t lblock = offset >> inode->i_blkbits;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (unlikely(error))
-		goto out;
+		return error;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 
@@ -807,39 +690,31 @@ static int fallocate_chunk(struct inode 
 			goto out;
 	}
 
-	curr = start;
-	offset = start << PAGE_CACHE_SHIFT;
-	from = start_offset;
-	to = PAGE_CACHE_SIZE;
-	while (curr <= end) {
-		page = grab_cache_page_write_begin(inode->i_mapping, curr,
-						   AOP_FLAG_NOFS);
-		if (unlikely(!page)) {
-			error = -ENOMEM;
-			goto out;
-		}
+	while (len) {
+		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+		bh_map.b_size = len;
+		set_buffer_zeronew(&bh_map);
 
-		if (curr == end)
-			to = end_offset;
-		error = write_empty_blocks(page, from, to, mode);
-		if (!error && offset + to > inode->i_size &&
-		    !(mode & FALLOC_FL_KEEP_SIZE)) {
-			i_size_write(inode, offset + to);
-		}
-		unlock_page(page);
-		page_cache_release(page);
-		if (error)
+		error = gfs2_block_map(inode, lblock, &bh_map, 1);
+		if (unlikely(error))
 			goto out;
-		curr++;
-		offset += PAGE_CACHE_SIZE;
-		from = 0;
+		len -= bh_map.b_size;
+		nr_blks = bh_map.b_size >> inode->i_blkbits;
+		lblock += nr_blks;
+		if (!buffer_new(&bh_map))
+			continue;
+		if (unlikely(!buffer_zeronew(&bh_map))) {
+			error = -EIO;
+			goto out;
+		}
 	}
+	if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
+		i_size_write(inode, offset + len);
 
 	mark_inode_dirty(inode);
 
-	brelse(dibh);
-
 out:
+	brelse(dibh);
 	return error;
 }
 
@@ -879,6 +754,7 @@ static long gfs2_fallocate(struct file *
 	int error;
 	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
 	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+	loff_t max_chunk_size = UINT_MAX & bsize_mask;
 	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -932,7 +808,8 @@ retry:
 			goto out_qunlock;
 		}
 		max_bytes = bytes;
-		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+		calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
+				&max_bytes, &data_blocks, &ind_blocks);
 		al->al_requested = data_blocks + ind_blocks;
 
 		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
Index: gfs2-2.6-nmw/fs/gfs2/bmap.c
===================================================================
--- gfs2-2.6-nmw.orig/fs/gfs2/bmap.c
+++ gfs2-2.6-nmw/fs/gfs2/bmap.c
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 
@@ -427,12 +428,14 @@ static int gfs2_bmap_alloc(struct inode 
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct super_block *sb = sdp->sd_vfs;
 	struct buffer_head *dibh = mp->mp_bh[0];
 	u64 bn, dblock = 0;
 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
 	unsigned dblks = 0;
 	unsigned ptrs_per_blk;
 	const unsigned end_of_metadata = height - 1;
+	int ret;
 	int eob = 0;
 	enum alloc_state state;
 	__be64 *ptr;
@@ -535,6 +538,15 @@ static int gfs2_bmap_alloc(struct inode 
 			dblock = bn;
 			while (n-- > 0)
 				*ptr++ = cpu_to_be64(bn++);
+			if (buffer_zeronew(bh_map)) {
+				ret = sb_issue_zeroout(sb, dblock, dblks,
+						       GFP_NOFS);
+				if (ret) {
+					fs_err(sdp,
+					       "Failed to zero data buffers\n");
+					clear_buffer_zeronew(bh_map);
+				}
+			}
 			break;
 		}
 	} while ((state != ALLOC_DATA) || !dblock);
Index: gfs2-2.6-nmw/fs/gfs2/incore.h
===================================================================
--- gfs2-2.6-nmw.orig/fs/gfs2/incore.h
+++ gfs2-2.6-nmw/fs/gfs2/incore.h
@@ -103,12 +103,15 @@ struct gfs2_rgrpd {
 enum gfs2_state_bits {
 	BH_Pinned = BH_PrivateStart,
 	BH_Escaped = BH_PrivateStart + 1,
+	BH_Zeronew = BH_PrivateStart + 2,
 };
 
 BUFFER_FNS(Pinned, pinned)
 TAS_BUFFER_FNS(Pinned, pinned)
 BUFFER_FNS(Escaped, escaped)
 TAS_BUFFER_FNS(Escaped, escaped)
+BUFFER_FNS(Zeronew, zeronew)
+TAS_BUFFER_FNS(Zeronew, zeronew)
 
 struct gfs2_bufdata {
 	struct buffer_head *bd_bh;



^ permalink raw reply	[flat|nested] 2+ messages in thread

* [Cluster-devel] [PATCH] GFS2: rewrite fallocate code to write blocks directly
  2011-09-12 23:15 [Cluster-devel] [PATCH] GFS2: rewrite fallocate code to write blocks directly Benjamin Marzinski
@ 2011-09-18 13:45 ` Steven Whitehouse
  0 siblings, 0 replies; 2+ messages in thread
From: Steven Whitehouse @ 2011-09-18 13:45 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

This looks really good. Sorry for not spotting it first time around - I
think it got lost in the backlog when I came back from holiday. Its now
in the -nmw tree. Thanks,

Steve.

On Mon, 2011-09-12 at 18:15 -0500, Benjamin Marzinski wrote:
> GFS2's fallocate code currently goes through the page cache. Since it's only
> writing to the end of the file or to holes in it, it doesn't need to, and it
> was causing issues on low memory environments. This patch pulls in some of
> Steve's block allocation work, and uses it to simply allocate the blocks for
> the file, and zero them out at allocation time.  It provides a slight
> performance increase, and it dramatically simplifies the code.
> 
> Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> ---
>  fs/gfs2/bmap.c   |   12 +++
>  fs/gfs2/file.c   |  171 +++++++------------------------------------------------
>  fs/gfs2/incore.h |    3 
>  3 files changed, 39 insertions(+), 147 deletions(-)
> 
> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -669,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struc
>  	return generic_file_aio_write(iocb, iov, nr_segs, pos);
>  }
>  
> -static int empty_write_end(struct page *page, unsigned from,
> -			   unsigned to, int mode)
> -{
> -	struct inode *inode = page->mapping->host;
> -	struct gfs2_inode *ip = GFS2_I(inode);
> -	struct buffer_head *bh;
> -	unsigned offset, blksize = 1 << inode->i_blkbits;
> -	pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
> -
> -	zero_user(page, from, to-from);
> -	mark_page_accessed(page);
> -
> -	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> -		if (!gfs2_is_writeback(ip))
> -			gfs2_page_add_databufs(ip, page, from, to);
> -
> -		block_commit_write(page, from, to);
> -		return 0;
> -	}
> -
> -	offset = 0;
> -	bh = page_buffers(page);
> -	while (offset < to) {
> -		if (offset >= from) {
> -			set_buffer_uptodate(bh);
> -			mark_buffer_dirty(bh);
> -			clear_buffer_new(bh);
> -			write_dirty_buffer(bh, WRITE);
> -		}
> -		offset += blksize;
> -		bh = bh->b_this_page;
> -	}
> -
> -	offset = 0;
> -	bh = page_buffers(page);
> -	while (offset < to) {
> -		if (offset >= from) {
> -			wait_on_buffer(bh);
> -			if (!buffer_uptodate(bh))
> -				return -EIO;
> -		}
> -		offset += blksize;
> -		bh = bh->b_this_page;
> -	}
> -	return 0;
> -}
> -
> -static int needs_empty_write(sector_t block, struct inode *inode)
> -{
> -	int error;
> -	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
> -
> -	bh_map.b_size = 1 << inode->i_blkbits;
> -	error = gfs2_block_map(inode, block, &bh_map, 0);
> -	if (unlikely(error))
> -		return error;
> -	return !buffer_mapped(&bh_map);
> -}
> -
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> -			      int mode)
> -{
> -	struct inode *inode = page->mapping->host;
> -	unsigned start, end, next, blksize;
> -	sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
> -	int ret;
> -
> -	blksize = 1 << inode->i_blkbits;
> -	next = end = 0;
> -	while (next < from) {
> -		next += blksize;
> -		block++;
> -	}
> -	start = next;
> -	do {
> -		next += blksize;
> -		ret = needs_empty_write(block, inode);
> -		if (unlikely(ret < 0))
> -			return ret;
> -		if (ret == 0) {
> -			if (end) {
> -				ret = __block_write_begin(page, start, end - start,
> -							  gfs2_block_map);
> -				if (unlikely(ret))
> -					return ret;
> -				ret = empty_write_end(page, start, end, mode);
> -				if (unlikely(ret))
> -					return ret;
> -				end = 0;
> -			}
> -			start = next;
> -		}
> -		else
> -			end = next;
> -		block++;
> -	} while (next < to);
> -
> -	if (end) {
> -		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> -		if (unlikely(ret))
> -			return ret;
> -		ret = empty_write_end(page, start, end, mode);
> -		if (unlikely(ret))
> -			return ret;
> -	}
> -
> -	return 0;
> -}
> -
>  static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
>  			   int mode)
>  {
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  	struct buffer_head *dibh;
>  	int error;
> -	u64 start = offset >> PAGE_CACHE_SHIFT;
> -	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
> -	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
> -	pgoff_t curr;
> -	struct page *page;
> -	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
> -	unsigned int from, to;
> -
> -	if (!end_offset)
> -		end_offset = PAGE_CACHE_SIZE;
> +	unsigned int nr_blks;
> +	sector_t lblock = offset >> inode->i_blkbits;
>  
>  	error = gfs2_meta_inode_buffer(ip, &dibh);
>  	if (unlikely(error))
> -		goto out;
> +		return error;
>  
>  	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
>  
> @@ -807,39 +690,31 @@ static int fallocate_chunk(struct inode 
>  			goto out;
>  	}
>  
> -	curr = start;
> -	offset = start << PAGE_CACHE_SHIFT;
> -	from = start_offset;
> -	to = PAGE_CACHE_SIZE;
> -	while (curr <= end) {
> -		page = grab_cache_page_write_begin(inode->i_mapping, curr,
> -						   AOP_FLAG_NOFS);
> -		if (unlikely(!page)) {
> -			error = -ENOMEM;
> -			goto out;
> -		}
> +	while (len) {
> +		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
> +		bh_map.b_size = len;
> +		set_buffer_zeronew(&bh_map);
>  
> -		if (curr == end)
> -			to = end_offset;
> -		error = write_empty_blocks(page, from, to, mode);
> -		if (!error && offset + to > inode->i_size &&
> -		    !(mode & FALLOC_FL_KEEP_SIZE)) {
> -			i_size_write(inode, offset + to);
> -		}
> -		unlock_page(page);
> -		page_cache_release(page);
> -		if (error)
> +		error = gfs2_block_map(inode, lblock, &bh_map, 1);
> +		if (unlikely(error))
>  			goto out;
> -		curr++;
> -		offset += PAGE_CACHE_SIZE;
> -		from = 0;
> +		len -= bh_map.b_size;
> +		nr_blks = bh_map.b_size >> inode->i_blkbits;
> +		lblock += nr_blks;
> +		if (!buffer_new(&bh_map))
> +			continue;
> +		if (unlikely(!buffer_zeronew(&bh_map))) {
> +			error = -EIO;
> +			goto out;
> +		}
>  	}
> +	if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
> +		i_size_write(inode, offset + len);
>  
>  	mark_inode_dirty(inode);
>  
> -	brelse(dibh);
> -
>  out:
> +	brelse(dibh);
>  	return error;
>  }
>  
> @@ -879,6 +754,7 @@ static long gfs2_fallocate(struct file *
>  	int error;
>  	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
>  	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
> +	loff_t max_chunk_size = UINT_MAX & bsize_mask;
>  	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
>  
>  	/* We only support the FALLOC_FL_KEEP_SIZE mode */
> @@ -932,7 +808,8 @@ retry:
>  			goto out_qunlock;
>  		}
>  		max_bytes = bytes;
> -		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
> +		calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
> +				&max_bytes, &data_blocks, &ind_blocks);
>  		al->al_requested = data_blocks + ind_blocks;
>  
>  		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
> Index: gfs2-2.6-nmw/fs/gfs2/bmap.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/bmap.c
> +++ gfs2-2.6-nmw/fs/gfs2/bmap.c
> @@ -10,6 +10,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/completion.h>
>  #include <linux/buffer_head.h>
> +#include <linux/blkdev.h>
>  #include <linux/gfs2_ondisk.h>
>  #include <linux/crc32.h>
>  
> @@ -427,12 +428,14 @@ static int gfs2_bmap_alloc(struct inode 
>  {
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  	struct gfs2_sbd *sdp = GFS2_SB(inode);
> +	struct super_block *sb = sdp->sd_vfs;
>  	struct buffer_head *dibh = mp->mp_bh[0];
>  	u64 bn, dblock = 0;
>  	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
>  	unsigned dblks = 0;
>  	unsigned ptrs_per_blk;
>  	const unsigned end_of_metadata = height - 1;
> +	int ret;
>  	int eob = 0;
>  	enum alloc_state state;
>  	__be64 *ptr;
> @@ -535,6 +538,15 @@ static int gfs2_bmap_alloc(struct inode 
>  			dblock = bn;
>  			while (n-- > 0)
>  				*ptr++ = cpu_to_be64(bn++);
> +			if (buffer_zeronew(bh_map)) {
> +				ret = sb_issue_zeroout(sb, dblock, dblks,
> +						       GFP_NOFS);
> +				if (ret) {
> +					fs_err(sdp,
> +					       "Failed to zero data buffers\n");
> +					clear_buffer_zeronew(bh_map);
> +				}
> +			}
>  			break;
>  		}
>  	} while ((state != ALLOC_DATA) || !dblock);
> Index: gfs2-2.6-nmw/fs/gfs2/incore.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/incore.h
> +++ gfs2-2.6-nmw/fs/gfs2/incore.h
> @@ -103,12 +103,15 @@ struct gfs2_rgrpd {
>  enum gfs2_state_bits {
>  	BH_Pinned = BH_PrivateStart,
>  	BH_Escaped = BH_PrivateStart + 1,
> +	BH_Zeronew = BH_PrivateStart + 2,
>  };
>  
>  BUFFER_FNS(Pinned, pinned)
>  TAS_BUFFER_FNS(Pinned, pinned)
>  BUFFER_FNS(Escaped, escaped)
>  TAS_BUFFER_FNS(Escaped, escaped)
> +BUFFER_FNS(Zeronew, zeronew)
> +TAS_BUFFER_FNS(Zeronew, zeronew)
>  
>  struct gfs2_bufdata {
>  	struct buffer_head *bd_bh;
> 




^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2011-09-18 13:45 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-09-12 23:15 [Cluster-devel] [PATCH] GFS2: rewrite fallocate code to write blocks directly Benjamin Marzinski
2011-09-18 13:45 ` Steven Whitehouse

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).