From mboxrd@z Thu Jan 1 00:00:00 1970 From: Steven Whitehouse Date: Sat, 31 Mar 2007 16:34:19 +0100 Subject: [Cluster-devel] [GFS2] [RFC] Patch for write_begin/write_end Message-ID: <1175355259.1636.186.camel@quoit.chygwyn.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi, Nick Piggin has proposed a series of patches to the VFS which change the VFS interface from prepare_write/commit_write to write_begin/write_end. This has the great advantage to GFS2 that we no longer need to deal with the AOP_TRUNCATED_PAGE complications since we are then able to lock the glock and the page lock in the correct order. The filesystem becomes responsible for looking up the page, which is an extra complication, but one that seems well worth paying for. This should reduce lock bounce during contention with the alternate test for example and result in better throughput. Important Note: This _doesn't_ apply (like the usual patches) directly to the -nmw git tree. If you want to test this, then you need to start with the -nmw tree, then apply Nick's patches: http://www.kernel.org/pub/linux/kernel/people/npiggin/patches/new-aops/ To that you'll then need to apply Mark Fasheh's fixes to export a couple of functions and a bug in simple_write_begin which are entitled: "Re: Announce: new-aops-1 for 2.6.21-rc3" and were posted on linux-fsdevel and cluster-devel on the 15th Mar. After all that then you can apply the following patch. Its only had the lightest of testing so far, but it does seem to work and survives postmark. The plan is that this will not take the "normal" way into the kernel for GFS2 patches, but will eventually be submitted along with Nick's patches in a batch, probably after a period in -mm. As a result of that patch series, support for AOP_TRUNCATED_PAGE for buffered writes will no longer be needed, and thus removed. It may be a little way off yet, but this is a heads up in the mean time, Steve. diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 90c2879..c15de73 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -337,45 +338,49 @@ out_unlock: } /** - * gfs2_prepare_write - Prepare to write a page to a file + * gfs2_write_begin - Begin to write to a file * @file: The file to write to - * @page: The page which is to be prepared for writing - * @from: From (byte range within page) - * @to: To (byte range within page) + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) * * Returns: errno */ -static int gfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); unsigned int data_blocks, ind_blocks, rblocks; int alloc_required; int error = 0; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from; - loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; struct gfs2_alloc *al; - unsigned int write_len = to - from; - + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct page *page; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); error = gfs2_glock_nq_atime(&ip->i_gh); - if (unlikely(error)) { - if (error == GLR_TRYFAILED) { - unlock_page(page); - error = AOP_TRUNCATED_PAGE; - yield(); - } + if (unlikely(error)) goto out_uninit; - } - gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks); + error = -ENOMEM; + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + goto out_unlock; + + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required); + error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) - goto out_unlock; + goto out_putpage; ip->i_alloc.al_requested = 0; @@ -407,7 +412,7 @@ static int gfs2_prepare_write(struct file *file, struct page *page, goto out; if (gfs2_is_stuffed(ip)) { - if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { error = gfs2_unstuff_dinode(ip, page); if (error == 0) goto prepare_write; @@ -429,6 +434,10 @@ out_qunlock: out_alloc_put: gfs2_alloc_put(ip); } +out_putpage: + page_cache_release(page); + if (pos + len > ip->i_inode.i_size) + vmtruncate(&ip->i_inode, ip->i_inode.i_size); out_unlock: gfs2_glock_dq_m(1, &ip->i_gh); out_uninit: @@ -439,96 +448,128 @@ out_uninit: } /** - * gfs2_commit_write - Commit write to a file + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. + * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); + mark_inode_dirty(inode); + } + + brelse(dibh); + gfs2_trans_end(sdp); + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end * @file: The file to write to - * @page: The page containing the data - * @from: From (byte range within page) - * @to: To (byte range within page) + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. * * Returns: errno */ -static int gfs2_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int error = -EOPNOTSUPP; struct buffer_head *dibh; struct gfs2_alloc *al = &ip->i_alloc; struct gfs2_dinode *di; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) - goto fail_nounlock; + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_endtrans; + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - - if (gfs2_is_stuffed(ip)) { - u64 file_size; - void *kaddr; - file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to; + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); - kaddr = kmap_atomic(page, KM_USER0); - memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from, - kaddr + from, to - from); - kunmap_atomic(kaddr, KM_USER0); + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + gfs2_page_add_databufs(ip, page, from, to); - SetPageUptodate(page); + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - if (inode->i_size < file_size) { - i_size_write(inode, file_size); + if (likely(ret >= 0)) { + copied = ret; + if ((pos + copied) > inode->i_size) { + di = (struct gfs2_dinode *)dibh->b_data; + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); mark_inode_dirty(inode); } - } else { - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || - gfs2_is_jdata(ip)) - gfs2_page_add_databufs(ip, page, from, to); - error = generic_commit_write(file, page, from, to); - if (error) - goto fail; - } - - if (ip->i_di.di_size < inode->i_size) { - ip->i_di.di_size = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); - } - - brelse(dibh); - gfs2_trans_end(sdp); - if (al->al_requested) { - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); } - unlock_page(page); - gfs2_glock_dq_m(1, &ip->i_gh); - lock_page(page); - gfs2_holder_uninit(&ip->i_gh); - return 0; -fail: brelse(dibh); -fail_endtrans: gfs2_trans_end(sdp); +failed: if (al->al_requested) { gfs2_inplace_release(ip); gfs2_quota_unlock(ip); gfs2_alloc_put(ip); } - unlock_page(page); - gfs2_glock_dq_m(1, &ip->i_gh); - lock_page(page); + gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); -fail_nounlock: - ClearPageUptodate(page); - return error; + return ret; } /** @@ -797,8 +838,8 @@ const struct address_space_operations gfs2_file_aops = { .readpage = gfs2_readpage, .readpages = gfs2_readpages, .sync_page = block_sync_page, - .prepare_write = gfs2_prepare_write, - .commit_write = gfs2_commit_write, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage,