From mboxrd@z Thu Jan 1 00:00:00 1970 From: Steven Whitehouse Date: Mon, 23 Aug 2010 16:29:39 +0100 Subject: [Cluster-devel] [PATCH] GFS2: fallocate support In-Reply-To: <20100820052102.GO3068@ether.msp.redhat.com> References: <20100820052102.GO3068@ether.msp.redhat.com> Message-ID: <1282577379.2514.68.camel@localhost> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi, Now in the -nmw git tree. Thanks, Steve. On Fri, 2010-08-20 at 00:21 -0500, Benjamin Marzinski wrote: > This patch adds support for fallocate to gfs2. Since gfs2 does not support > uninitialized data blocks, it must write out zeros to all the blocks. However, > since it does not need to lock any pages to read from, gfs2 can write out the > zero blocks much more efficiently. On a moderately full filesystem, fallocate > works around 5 times faster on average. The fallocate call also allows gfs2 to > add blocks to the file without changing the file size, which will make it > possible for gfs2 to preallocate space for the rindex file, so that gfs2 can > grow a completely full filesystem. 
> > Signed-off-by: Benjamin Marzinski > --- > fs/gfs2/aops.c | 4 > fs/gfs2/incore.h | 1 > fs/gfs2/inode.h | 2 > fs/gfs2/ops_inode.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/gfs2/rgrp.c | 12 ++ > fs/gfs2/trans.h | 1 > 6 files changed, 272 insertions(+), 2 deletions(-) > > Index: gfs2-2.6-nmw/fs/gfs2/aops.c > =================================================================== > --- gfs2-2.6-nmw.orig/fs/gfs2/aops.c > +++ gfs2-2.6-nmw/fs/gfs2/aops.c > @@ -36,8 +36,8 @@ > #include "glops.h" > > > -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, > - unsigned int from, unsigned int to) > +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, > + unsigned int from, unsigned int to) > { > struct buffer_head *head = page_buffers(page); > unsigned int bsize = head->b_size; > Index: gfs2-2.6-nmw/fs/gfs2/inode.h > =================================================================== > --- gfs2-2.6-nmw.orig/fs/gfs2/inode.h > +++ gfs2-2.6-nmw/fs/gfs2/inode.h > @@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page > extern int gfs2_internal_read(struct gfs2_inode *ip, > struct file_ra_state *ra_state, > char *buf, loff_t *pos, unsigned size); > +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, > + unsigned int from, unsigned int to); > extern void gfs2_set_aops(struct inode *inode); > > static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) > Index: gfs2-2.6-nmw/fs/gfs2/ops_inode.c > =================================================================== > --- gfs2-2.6-nmw.orig/fs/gfs2/ops_inode.c > +++ gfs2-2.6-nmw/fs/gfs2/ops_inode.c > @@ -18,6 +18,8 @@ > #include > #include > #include > +#include > +#include > #include > > #include "gfs2.h" > @@ -1277,6 +1279,257 @@ static int gfs2_removexattr(struct dentr > return ret; > } > > +static void empty_write_end(struct page *page, unsigned from, > + unsigned to) > +{ > + struct gfs2_inode *ip = GFS2_I(page->mapping->host); > + > + 
page_zero_new_buffers(page, from, to); > + flush_dcache_page(page); > + mark_page_accessed(page); > + > + if (!gfs2_is_writeback(ip)) > + gfs2_page_add_databufs(ip, page, from, to); > + > + block_commit_write(page, from, to); > +} > + > + > +static int write_empty_blocks(struct page *page, unsigned from, unsigned to) > +{ > + unsigned start, end, next; > + struct buffer_head *bh, *head; > + int error; > + > + if (!page_has_buffers(page)) { > + error = block_prepare_write(page, from, to, gfs2_block_map); > + if (unlikely(error)) > + return error; > + > + empty_write_end(page, from, to); > + return 0; > + } > + > + bh = head = page_buffers(page); > + next = end = 0; > + while (next < from) { > + next += bh->b_size; > + bh = bh->b_this_page; > + } > + start = next; > + do { > + next += bh->b_size; > + if (buffer_mapped(bh)) { > + if (end) { > + error = block_prepare_write(page, start, end, > + gfs2_block_map); > + if (unlikely(error)) > + return error; > + empty_write_end(page, start, end); > + end = 0; > + } > + start = next; > + } > + else > + end = next; > + bh = bh->b_this_page; > + } while (next < to); > + > + if (end) { > + error = block_prepare_write(page, start, end, gfs2_block_map); > + if (unlikely(error)) > + return error; > + empty_write_end(page, start, end); > + } > + > + return 0; > +} > + > +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, > + int mode) > +{ > + struct gfs2_inode *ip = GFS2_I(inode); > + struct buffer_head *dibh; > + int error; > + u64 start = offset >> PAGE_CACHE_SHIFT; > + unsigned int start_offset = offset & ~PAGE_CACHE_MASK; > + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT; > + pgoff_t curr; > + struct page *page; > + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK; > + unsigned int from, to; > + > + if (!end_offset) > + end_offset = PAGE_CACHE_SIZE; > + > + error = gfs2_meta_inode_buffer(ip, &dibh); > + if (unlikely(error)) > + goto out; > + > + gfs2_trans_add_bh(ip->i_gl, dibh, 1); > + > 
+ if (gfs2_is_stuffed(ip)) { > + error = gfs2_unstuff_dinode(ip, NULL); > + if (unlikely(error)) > + goto out; > + } > + > + curr = start; > + offset = start << PAGE_CACHE_SHIFT; > + from = start_offset; > + to = PAGE_CACHE_SIZE; > + while (curr <= end) { > + page = grab_cache_page_write_begin(inode->i_mapping, curr, > + AOP_FLAG_NOFS); > + if (unlikely(!page)) { > + error = -ENOMEM; > + goto out; > + } > + > + if (curr == end) > + to = end_offset; > + error = write_empty_blocks(page, from, to); > + if (!error && offset + to > inode->i_size && > + !(mode & FALLOC_FL_KEEP_SIZE)) { > + i_size_write(inode, offset + to); > + } > + unlock_page(page); > + page_cache_release(page); > + if (error) > + goto out; > + curr++; > + offset += PAGE_CACHE_SIZE; > + from = 0; > + } > + > + gfs2_dinode_out(ip, dibh->b_data); > + mark_inode_dirty(inode); > + > + brelse(dibh); > + > +out: > + return error; > +} > + > +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, > + unsigned int *data_blocks, unsigned int *ind_blocks) > +{ > + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); > + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone; > + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); > + > + for (tmp = max_data; tmp > sdp->sd_diptrs;) { > + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); > + max_data -= tmp; > + } > + /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, > + so it might end up with fewer data blocks */ > + if (max_data <= *data_blocks) > + return; > + *data_blocks = max_data; > + *ind_blocks = max_blocks - max_data; > + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; > + if (*len > max) { > + *len = max; > + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); > + } > +} > + > +static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset, > + loff_t len) > +{ > + struct gfs2_sbd *sdp = GFS2_SB(inode); > + struct gfs2_inode *ip = GFS2_I(inode); > + unsigned int 
data_blocks = 0, ind_blocks = 0, rblocks; > + loff_t bytes, max_bytes; > + struct gfs2_alloc *al; > + int error; > + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; > + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; > + > + offset = (offset >> sdp->sd_sb.sb_bsize_shift) << > + sdp->sd_sb.sb_bsize_shift; > + > + len = next - offset; > + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; > + if (!bytes) > + bytes = UINT_MAX; > + > + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); > + error = gfs2_glock_nq(&ip->i_gh); > + if (unlikely(error)) > + goto out_uninit; > + > + if (!gfs2_write_alloc_required(ip, offset, len)) > + goto out_unlock; > + > + while (len > 0) { > + if (len < bytes) > + bytes = len; > + al = gfs2_alloc_get(ip); > + if (!al) { > + error = -ENOMEM; > + goto out_unlock; > + } > + > + error = gfs2_quota_lock_check(ip); > + if (error) > + goto out_alloc_put; > + > +retry: > + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); > + > + al->al_requested = data_blocks + ind_blocks; > + error = gfs2_inplace_reserve(ip); > + if (error) { > + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { > + bytes >>= 1; > + goto retry; > + } > + goto out_qunlock; > + } > + max_bytes = bytes; > + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks); > + al->al_requested = data_blocks + ind_blocks; > + > + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + > + RES_RG_HDR + ip->i_alloc->al_rgd->rd_length; > + if (gfs2_is_jdata(ip)) > + rblocks += data_blocks ? 
data_blocks : 1; > + > + error = gfs2_trans_begin(sdp, rblocks, > + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); > + if (error) > + goto out_trans_fail; > + > + error = fallocate_chunk(inode, offset, max_bytes, mode); > + gfs2_trans_end(sdp); > + > + if (error) > + goto out_trans_fail; > + > + len -= max_bytes; > + offset += max_bytes; > + gfs2_inplace_release(ip); > + gfs2_quota_unlock(ip); > + gfs2_alloc_put(ip); > + } > + goto out_unlock; > + > +out_trans_fail: > + gfs2_inplace_release(ip); > +out_qunlock: > + gfs2_quota_unlock(ip); > +out_alloc_put: > + gfs2_alloc_put(ip); > +out_unlock: > + gfs2_glock_dq(&ip->i_gh); > +out_uninit: > + gfs2_holder_uninit(&ip->i_gh); > + return error; > +} > + > + > static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > u64 start, u64 len) > { > @@ -1327,6 +1580,7 @@ const struct inode_operations gfs2_file_ > .getxattr = gfs2_getxattr, > .listxattr = gfs2_listxattr, > .removexattr = gfs2_removexattr, > + .fallocate = gfs2_fallocate, > .fiemap = gfs2_fiemap, > }; > > Index: gfs2-2.6-nmw/fs/gfs2/incore.h > =================================================================== > --- gfs2-2.6-nmw.orig/fs/gfs2/incore.h > +++ gfs2-2.6-nmw/fs/gfs2/incore.h > @@ -571,6 +571,7 @@ struct gfs2_sbd { > struct list_head sd_rindex_mru_list; > struct gfs2_rgrpd *sd_rindex_forward; > unsigned int sd_rgrps; > + unsigned int sd_max_rg_data; > > /* Journal index stuff */ > > Index: gfs2-2.6-nmw/fs/gfs2/rgrp.c > =================================================================== > --- gfs2-2.6-nmw.orig/fs/gfs2/rgrp.c > +++ gfs2-2.6-nmw/fs/gfs2/rgrp.c > @@ -589,6 +589,8 @@ static int gfs2_ri_update(struct gfs2_in > struct inode *inode = &ip->i_inode; > struct file_ra_state ra_state; > u64 rgrp_count = i_size_read(inode); > + struct gfs2_rgrpd *rgd; > + unsigned int max_data = 0; > int error; > > do_div(rgrp_count, sizeof(struct gfs2_rindex)); > @@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_in > } > } > > + 
list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) > + if (rgd->rd_data > max_data) > + max_data = rgd->rd_data; > + sdp->sd_max_rg_data = max_data; > sdp->sd_rindex_uptodate = 1; > return 0; > } > @@ -622,6 +628,8 @@ static int gfs2_ri_update_special(struct > struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); > struct inode *inode = &ip->i_inode; > struct file_ra_state ra_state; > + struct gfs2_rgrpd *rgd; > + unsigned int max_data = 0; > int error; > > file_ra_state_init(&ra_state, inode->i_mapping); > @@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct > return error; > } > } > + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list) > + if (rgd->rd_data > max_data) > + max_data = rgd->rd_data; > + sdp->sd_max_rg_data = max_data; > > sdp->sd_rindex_uptodate = 1; > return 0; > Index: gfs2-2.6-nmw/fs/gfs2/trans.h > =================================================================== > --- gfs2-2.6-nmw.orig/fs/gfs2/trans.h > +++ gfs2-2.6-nmw/fs/gfs2/trans.h > @@ -20,6 +20,7 @@ struct gfs2_glock; > #define RES_JDATA 1 > #define RES_DATA 1 > #define RES_LEAF 1 > +#define RES_RG_HDR 1 > #define RES_RG_BIT 2 > #define RES_EATTR 1 > #define RES_STATFS 1 >