cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
* [Cluster-devel] [PATCH] GFS2: directly write blocks past i_size
@ 2011-03-17  5:08 Benjamin Marzinski
  2011-03-17  9:26 ` Steven Whitehouse
  0 siblings, 1 reply; 4+ messages in thread
From: Benjamin Marzinski @ 2011-03-17  5:08 UTC (permalink / raw)
  To: cluster-devel.redhat.com

GFS2 was relying on the writepage code to write out the zeroed data for
fallocate.  However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
If it is, it will be ignored.  To work around this, gfs2 now calls
write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
is set, and it's writing past i_size.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
---
 fs/gfs2/file.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 52 insertions(+), 10 deletions(-)

Index: gfs2-2.6-nmw/fs/gfs2/file.c
===================================================================
--- gfs2-2.6-nmw.orig/fs/gfs2/file.c
+++ gfs2-2.6-nmw/fs/gfs2/file.c
@@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
 	return generic_file_aio_write(iocb, iov, nr_segs, pos);
 }
 
-static void empty_write_end(struct page *page, unsigned from,
-			   unsigned to)
+static int empty_write_end(struct page *page, unsigned from,
+			   unsigned to, int mode)
 {
-	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh;
+	int waiting = 0;
+	unsigned offset, blksize = 1 << inode->i_blkbits;
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 
 	zero_user(page, from, to-from);
 	mark_page_accessed(page);
 
-	if (!gfs2_is_writeback(ip))
-		gfs2_page_add_databufs(ip, page, from, to);
+ 	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
+		if (!gfs2_is_writeback(ip))
+			gfs2_page_add_databufs(ip, page, from, to);
 
-	block_commit_write(page, from, to);
+		block_commit_write(page, from, to);
+		return 0;
+	}
+
+second_pass:
+	offset = 0;
+	bh = page_buffers(page);
+	while (offset < from) {
+		offset += blksize;
+		bh = bh->b_this_page;
+	}
+	while (offset < to) {
+		if (waiting){
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				return -EIO;
+		}
+		else {
+			set_buffer_uptodate(bh);
+			mark_buffer_dirty(bh);
+			clear_buffer_new(bh);
+			write_dirty_buffer(bh, WRITE);
+		}
+		offset += blksize;
+		bh = bh->b_this_page;
+	}
+	if (!waiting) {
+		waiting = 1;
+		goto second_pass;
+	}
+	return 0;
 }
 
 static int needs_empty_write(sector_t block, struct inode *inode)
@@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
 	return !buffer_mapped(&bh_map);
 }
 
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
+			      int mode)
 {
 	struct inode *inode = page->mapping->host;
 	unsigned start, end, next, blksize;
@@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
 							  gfs2_block_map);
 				if (unlikely(ret))
 					return ret;
-				empty_write_end(page, start, end);
+				ret = empty_write_end(page, start, end, mode);
+				if (unlikely(ret))
+					return ret;
 				end = 0;
 			}
 			start = next;
@@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
 		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
 		if (unlikely(ret))
 			return ret;
-		empty_write_end(page, start, end);
+		ret = empty_write_end(page, start, end, mode);
+		if (unlikely(ret))
+			return ret;
 	}
 
 	return 0;
@@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode 
 
 		if (curr == end)
 			to = end_offset;
-		error = write_empty_blocks(page, from, to);
+		error = write_empty_blocks(page, from, to, mode);
 		if (!error && offset + to > inode->i_size &&
 		    !(mode & FALLOC_FL_KEEP_SIZE)) {
 			i_size_write(inode, offset + to);



^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Cluster-devel] [PATCH] GFS2: directly write blocks past i_size
  2011-03-17  5:08 [Cluster-devel] [PATCH] GFS2: directly write blocks past i_size Benjamin Marzinski
@ 2011-03-17  9:26 ` Steven Whitehouse
  2011-03-17 14:05   ` Benjamin Marzinski
  2011-03-18  2:53   ` Benjamin Marzinski
  0 siblings, 2 replies; 4+ messages in thread
From: Steven Whitehouse @ 2011-03-17  9:26 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Hi,

On Thu, 2011-03-17 at 00:08 -0500, Benjamin Marzinski wrote:
> GFS2 was relying on the writepage code to write out the zeroed data for
> fallocate.  However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> If it is, it will be ignored.  To work around this, gfs2 now calls
> write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> is set, and it's writing past i_size.
> 
> Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> ---
>  fs/gfs2/file.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 52 insertions(+), 10 deletions(-)
> 
Generally looks good, but a few minor points....

> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
>  	return generic_file_aio_write(iocb, iov, nr_segs, pos);
>  }
>  
> -static void empty_write_end(struct page *page, unsigned from,
> -			   unsigned to)
> +static int empty_write_end(struct page *page, unsigned from,
> +			   unsigned to, int mode)
>  {
> -	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> +	struct inode *inode = page->mapping->host;
> +	struct gfs2_inode *ip = GFS2_I(inode);
> +	struct buffer_head *bh;
> +	int waiting = 0;
> +	unsigned offset, blksize = 1 << inode->i_blkbits;
> +	loff_t i_size = i_size_read(inode);
> +	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
>  
>  	zero_user(page, from, to-from);
>  	mark_page_accessed(page);
>  
> -	if (!gfs2_is_writeback(ip))
> -		gfs2_page_add_databufs(ip, page, from, to);
> + 	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> +		if (!gfs2_is_writeback(ip))
> +			gfs2_page_add_databufs(ip, page, from, to);
>  
> -	block_commit_write(page, from, to);
> +		block_commit_write(page, from, to);
> +		return 0;
> +	}
> +
> +second_pass:
> +	offset = 0;
> +	bh = page_buffers(page);
> +	while (offset < from) {
> +		offset += blksize;
> +		bh = bh->b_this_page;
> +	}
> +	while (offset < to) {
> +		if (waiting){
> +			wait_on_buffer(bh);
> +			if (!buffer_uptodate(bh))
> +				return -EIO;
> +		}
> +		else {
The else should start on the line above to make it more obvious.

> +			set_buffer_uptodate(bh);
> +			mark_buffer_dirty(bh);
> +			clear_buffer_new(bh);
> +			write_dirty_buffer(bh, WRITE);
Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?

> +		}
> +		offset += blksize;
> +		bh = bh->b_this_page;
> +	}
> +	if (!waiting) {
> +		waiting = 1;
> +		goto second_pass;
> +	}
I think the code might be a bit cleaner if it was just written as two
loops, one after the other since most of the loop content seems to be
different according to weather "waiting" is set or not.

Otherwise I think this is a good solution,

Steve.

> +	return 0;
>  }
>  
>  static int needs_empty_write(sector_t block, struct inode *inode)
> @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
>  	return !buffer_mapped(&bh_map);
>  }
>  
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> +			      int mode)
>  {
>  	struct inode *inode = page->mapping->host;
>  	unsigned start, end, next, blksize;
> @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
>  							  gfs2_block_map);
>  				if (unlikely(ret))
>  					return ret;
> -				empty_write_end(page, start, end);
> +				ret = empty_write_end(page, start, end, mode);
> +				if (unlikely(ret))
> +					return ret;
>  				end = 0;
>  			}
>  			start = next;
> @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
>  		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
>  		if (unlikely(ret))
>  			return ret;
> -		empty_write_end(page, start, end);
> +		ret = empty_write_end(page, start, end, mode);
> +		if (unlikely(ret))
> +			return ret;
>  	}
>  
>  	return 0;
> @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode 
>  
>  		if (curr == end)
>  			to = end_offset;
> -		error = write_empty_blocks(page, from, to);
> +		error = write_empty_blocks(page, from, to, mode);
>  		if (!error && offset + to > inode->i_size &&
>  		    !(mode & FALLOC_FL_KEEP_SIZE)) {
>  			i_size_write(inode, offset + to);
> 




^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Cluster-devel] [PATCH] GFS2: directly write blocks past i_size
  2011-03-17  9:26 ` Steven Whitehouse
@ 2011-03-17 14:05   ` Benjamin Marzinski
  2011-03-18  2:53   ` Benjamin Marzinski
  1 sibling, 0 replies; 4+ messages in thread
From: Benjamin Marzinski @ 2011-03-17 14:05 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On Thu, Mar 17, 2011 at 09:26:08AM +0000, Steven Whitehouse wrote:
> Hi,
> 
> On Thu, 2011-03-17 at 00:08 -0500, Benjamin Marzinski wrote:
> > GFS2 was relying on the writepage code to write out the zeroed data for
> > fallocate.  However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> > If it is, it will be ignored.  To work around this, gfs2 now calls
> > write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> > is set, and it's writing past i_size.
> > 
> > Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
> > ---
> >  fs/gfs2/file.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 52 insertions(+), 10 deletions(-)
> > 
> Generally looks good, but a few minor points....
> 
> > Index: gfs2-2.6-nmw/fs/gfs2/file.c
> > ===================================================================
> > --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> > +++ gfs2-2.6-nmw/fs/gfs2/file.c
> > @@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
> >  	return generic_file_aio_write(iocb, iov, nr_segs, pos);
> >  }
> >  
> > -static void empty_write_end(struct page *page, unsigned from,
> > -			   unsigned to)
> > +static int empty_write_end(struct page *page, unsigned from,
> > +			   unsigned to, int mode)
> >  {
> > -	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> > +	struct inode *inode = page->mapping->host;
> > +	struct gfs2_inode *ip = GFS2_I(inode);
> > +	struct buffer_head *bh;
> > +	int waiting = 0;
> > +	unsigned offset, blksize = 1 << inode->i_blkbits;
> > +	loff_t i_size = i_size_read(inode);
> > +	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> >  
> >  	zero_user(page, from, to-from);
> >  	mark_page_accessed(page);
> >  
> > -	if (!gfs2_is_writeback(ip))
> > -		gfs2_page_add_databufs(ip, page, from, to);
> > + 	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> > +		if (!gfs2_is_writeback(ip))
> > +			gfs2_page_add_databufs(ip, page, from, to);
> >  
> > -	block_commit_write(page, from, to);
> > +		block_commit_write(page, from, to);
> > +		return 0;
> > +	}
> > +
> > +second_pass:
> > +	offset = 0;
> > +	bh = page_buffers(page);
> > +	while (offset < from) {
> > +		offset += blksize;
> > +		bh = bh->b_this_page;
> > +	}
> > +	while (offset < to) {
> > +		if (waiting){
> > +			wait_on_buffer(bh);
> > +			if (!buffer_uptodate(bh))
> > +				return -EIO;
> > +		}
> > +		else {
> The else should start on the line above to make it more obvious.
> 
> > +			set_buffer_uptodate(bh);
> > +			mark_buffer_dirty(bh);
> > +			clear_buffer_new(bh);
> > +			write_dirty_buffer(bh, WRITE);
> Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?
> 
> > +		}
> > +		offset += blksize;
> > +		bh = bh->b_this_page;
> > +	}
> > +	if (!waiting) {
> > +		waiting = 1;
> > +		goto second_pass;
> > +	}
> I think the code might be a bit cleaner if it was just written as two
> loops, one after the other since most of the loop content seems to be
> different according to weather "waiting" is set or not.
> 
> Otherwise I think this is a good solution,
> 
> Steve.
> 


Sure.  This patch kept evolving, and I probably should have given it a
better cleanup once I was satisfied that it worked.  I send off a new
version.

-Ben

> > +	return 0;
> >  }
> >  
> >  static int needs_empty_write(sector_t block, struct inode *inode)
> > @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
> >  	return !buffer_mapped(&bh_map);
> >  }
> >  
> > -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> > +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> > +			      int mode)
> >  {
> >  	struct inode *inode = page->mapping->host;
> >  	unsigned start, end, next, blksize;
> > @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
> >  							  gfs2_block_map);
> >  				if (unlikely(ret))
> >  					return ret;
> > -				empty_write_end(page, start, end);
> > +				ret = empty_write_end(page, start, end, mode);
> > +				if (unlikely(ret))
> > +					return ret;
> >  				end = 0;
> >  			}
> >  			start = next;
> > @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
> >  		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> >  		if (unlikely(ret))
> >  			return ret;
> > -		empty_write_end(page, start, end);
> > +		ret = empty_write_end(page, start, end, mode);
> > +		if (unlikely(ret))
> > +			return ret;
> >  	}
> >  
> >  	return 0;
> > @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode 
> >  
> >  		if (curr == end)
> >  			to = end_offset;
> > -		error = write_empty_blocks(page, from, to);
> > +		error = write_empty_blocks(page, from, to, mode);
> >  		if (!error && offset + to > inode->i_size &&
> >  		    !(mode & FALLOC_FL_KEEP_SIZE)) {
> >  			i_size_write(inode, offset + to);
> > 
> 



^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Cluster-devel] [PATCH] GFS2: directly write blocks past i_size
  2011-03-17  9:26 ` Steven Whitehouse
  2011-03-17 14:05   ` Benjamin Marzinski
@ 2011-03-18  2:53   ` Benjamin Marzinski
  1 sibling, 0 replies; 4+ messages in thread
From: Benjamin Marzinski @ 2011-03-18  2:53 UTC (permalink / raw)
  To: cluster-devel.redhat.com

> Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?
> 

I tried these, but they didn't cause any improvement.  That doesn't seem
terribly surprising, since they both set NOIDLE, which tells the cfq
scheduler not to wait for more IO from this process.  I event tried
sending all the buffers except the last with WRITE_ODIRECT_PLUG, and
then the last buffer from each page with WRITE_SYNC, which seems like it
should work better, but in all the tests I did, it didn't appear to make
any difference.  I'm going to come back to this code after the release,
and see if I can find any way to make back some of the lost performance. 

As it is right now, fallocates that grow i_size are doing better than
double the speed of dd. fallocates that go past i_size are an order of
magnitude slower than dd.  But at least they work correctly.

> > +		}
> > +		offset += blksize;
> > +		bh = bh->b_this_page;
> > +	}
> > +	if (!waiting) {
> > +		waiting = 1;
> > +		goto second_pass;
> > +	}
> I think the code might be a bit cleaner if it was just written as two
> loops, one after the other since most of the loop content seems to be
> different according to weather "waiting" is set or not.
> 
> Otherwise I think this is a good solution,
> 
> Steve.
> 
> > +	return 0;
> >  }
> >  
> >  static int needs_empty_write(sector_t block, struct inode *inode)
> > @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
> >  	return !buffer_mapped(&bh_map);
> >  }
> >  
> > -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> > +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> > +			      int mode)
> >  {
> >  	struct inode *inode = page->mapping->host;
> >  	unsigned start, end, next, blksize;
> > @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
> >  							  gfs2_block_map);
> >  				if (unlikely(ret))
> >  					return ret;
> > -				empty_write_end(page, start, end);
> > +				ret = empty_write_end(page, start, end, mode);
> > +				if (unlikely(ret))
> > +					return ret;
> >  				end = 0;
> >  			}
> >  			start = next;
> > @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
> >  		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> >  		if (unlikely(ret))
> >  			return ret;
> > -		empty_write_end(page, start, end);
> > +		ret = empty_write_end(page, start, end, mode);
> > +		if (unlikely(ret))
> > +			return ret;
> >  	}
> >  
> >  	return 0;
> > @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode 
> >  
> >  		if (curr == end)
> >  			to = end_offset;
> > -		error = write_empty_blocks(page, from, to);
> > +		error = write_empty_blocks(page, from, to, mode);
> >  		if (!error && offset + to > inode->i_size &&
> >  		    !(mode & FALLOC_FL_KEEP_SIZE)) {
> >  			i_size_write(inode, offset + to);
> > 
> 



^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2011-03-18  2:53 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-03-17  5:08 [Cluster-devel] [PATCH] GFS2: directly write blocks past i_size Benjamin Marzinski
2011-03-17  9:26 ` Steven Whitehouse
2011-03-17 14:05   ` Benjamin Marzinski
2011-03-18  2:53   ` Benjamin Marzinski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).