* [Cluster-devel] [PATCH 0/2] Extended attribute readahead @ 2015-11-01 19:02 Andreas Gruenbacher 2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher 2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher 0 siblings, 2 replies; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-01 19:02 UTC (permalink / raw) To: cluster-devel.redhat.com Hello, here are two patches that implement extended attribute readahead for those cases where the extended attribute block is allocated immediately upon inode creation (i.e., with SELinux or inherited permissions). The first patch submits a READA request for the extended attributes after submitting a READ request for the inode. The second patch submits both requests in a single bio instead. As you can see, this is somewhat messy. I'm not convinced that this optimization buys us anything, so I will not recommend merging this patch. Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/agruen/linux.git gfs2-wip Could you please review? Thanks, Andreas Andreas Gruenbacher (2): gfs2: Extended attribute readahead gfs2: Extended attribute readahead optimization fs/gfs2/dir.c | 15 ++++++--- fs/gfs2/incore.h | 1 + fs/gfs2/meta_io.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/gfs2/meta_io.h | 2 +- fs/gfs2/quota.c | 2 +- fs/gfs2/rgrp.c | 2 +- fs/gfs2/super.c | 1 + fs/gfs2/xattr.c | 10 +++--- 8 files changed, 109 insertions(+), 19 deletions(-) -- 2.5.0 ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead 2015-11-01 19:02 [Cluster-devel] [PATCH 0/2] Extended attribute readahead Andreas Gruenbacher @ 2015-11-01 19:02 ` Andreas Gruenbacher 2015-11-03 17:29 ` Bob Peterson 2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher 1 sibling, 1 reply; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-01 19:02 UTC (permalink / raw) To: cluster-devel.redhat.com When gfs2 allocates an inode and its extended attribute block next to each other at inode create time, the inode's directory entry indicates that in de_rahead. In that case, we can readahead the extended attribute block when we read in the inode. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/dir.c | 15 +++++++++++---- fs/gfs2/incore.h | 1 + fs/gfs2/meta_io.c | 27 +++++++++++++++++++++++++-- fs/gfs2/meta_io.h | 2 +- fs/gfs2/quota.c | 2 +- fs/gfs2/rgrp.c | 2 +- fs/gfs2/super.c | 1 + fs/gfs2/xattr.c | 10 +++++----- 8 files changed, 46 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 487527b..b377bdc 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -108,7 +108,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; int error; - error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh); if (error) return error; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { @@ -305,7 +305,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, BUG_ON(extlen < 1); bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { - error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh); if (error) goto fail; } @@ -718,7 +718,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, { int error; - error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; @@ -1555,15 +1555,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { + struct inode *inode; + u16 rahead; + if (IS_ERR(dent)) return ERR_CAST(dent); dtype = be16_to_cpu(dent->de_type); + rahead = be16_to_cpu(dent->de_rahead); addr = be64_to_cpu(dent->de_inum.no_addr); formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + if (!IS_ERR(inode)) + GFS2_I(inode)->i_rahead = rahead; + return inode; } return ERR_PTR(-ENOENT); } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 121ed08..722bf69 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -403,6 +403,7 @@ struct gfs2_inode { u32 i_diskflags; u8 i_height; u8 i_depth; + u16 i_rahead; }; /* diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0e1d4be..0f24828 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,6 +187,21 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } +static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + + bh = gfs2_getbuf(gl, blkno, 1); + lock_buffer(bh); + if 
(buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + return; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(READA | REQ_META | REQ_PRIO, bh); +} + /** * gfs2_meta_read - Read a block from disk * @gl: The glock covering the block @@ -198,7 +213,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) */ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp) + int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct buffer_head *bh; @@ -213,11 +228,15 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); + if (rahead) + gfs2_meta_readahead(gl, blkno + 1); return 0; } bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); + if (rahead) + gfs2_meta_readahead(gl, blkno + 1); if (!(flags & DIO_WAIT)) return 0; @@ -341,8 +360,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head *bh; int ret = 0; u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + int rahead = 0; + + if (num == ip->i_no_addr) + rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 8ca1615..c5086c8 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp); + int rahead, struct buffer_head **bhp); extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a31226..e01298d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd) error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; - error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh); if (error) goto fail; error = -EIO; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 475985d..a2a6075 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1157,7 +1157,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh); if (error) goto fail; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 894fb01..8f94282 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1633,6 +1633,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip->i_gl = NULL; ip->i_rgd = NULL; ip->i_res = NULL; + ip->i_rahead = 0; } return &ip->i_inode; } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 4c096fa..f0fe884 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh); if (error) return error; @@ -143,7 +143,7 @@ 
static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) break; bn = be64_to_cpu(*eablk); - error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); @@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return -ENOMEM; for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0, bh + x); if (error) { while (x--) @@ -977,7 +977,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; @@ -1303,7 +1303,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; -- 2.5.0 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead 2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher @ 2015-11-03 17:29 ` Bob Peterson 2015-11-03 19:02 ` Steven Whitehouse 0 siblings, 1 reply; 13+ messages in thread From: Bob Peterson @ 2015-11-03 17:29 UTC (permalink / raw) To: cluster-devel.redhat.com ----- Original Message ----- > When gfs2 allocates an inode and its extended attribute block next to > each other at inode create time, the inode's directory entry indicates > that in de_rahead. In that case, we can readahead the extended > attribute block when we read in the inode. > > Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> > --- > fs/gfs2/dir.c | 15 +++++++++++---- > fs/gfs2/incore.h | 1 + > fs/gfs2/meta_io.c | 27 +++++++++++++++++++++++++-- > fs/gfs2/meta_io.h | 2 +- > fs/gfs2/quota.c | 2 +- > fs/gfs2/rgrp.c | 2 +- > fs/gfs2/super.c | 1 + > fs/gfs2/xattr.c | 10 +++++----- > 8 files changed, 46 insertions(+), 14 deletions(-) > Hi Andreas, Most of this looks good. However, two comments: 1. I don't like adding a new u16 to the gfs2_inode. I've been working to reduce the size of gfs2's inodes lately, so I'd rather see this implemented as a new GIF_RAHEAD (or similar) flag in gfs2_inode's i_flags. 2. It seems to me like we should take advantage of function gfs2_meta_ra() which already submits one block and a variable number of additional blocks for read-ahead, then waits for the first block IO to complete. Regards, Bob Peterson Red Hat File Systems ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead 2015-11-03 17:29 ` Bob Peterson @ 2015-11-03 19:02 ` Steven Whitehouse 2015-11-03 20:18 ` Andreas Gruenbacher 0 siblings, 1 reply; 13+ messages in thread From: Steven Whitehouse @ 2015-11-03 19:02 UTC (permalink / raw) To: cluster-devel.redhat.com Hi, On 03/11/15 17:29, Bob Peterson wrote: > ----- Original Message ----- >> When gfs2 allocates an inode and its extended attribute block next to >> each other at inode create time, the inode's directory entry indicates >> that in de_rahead. In that case, we can readahead the extended >> attribute block when we read in the inode. >> >> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> >> --- >> fs/gfs2/dir.c | 15 +++++++++++---- >> fs/gfs2/incore.h | 1 + >> fs/gfs2/meta_io.c | 27 +++++++++++++++++++++++++-- >> fs/gfs2/meta_io.h | 2 +- >> fs/gfs2/quota.c | 2 +- >> fs/gfs2/rgrp.c | 2 +- >> fs/gfs2/super.c | 1 + >> fs/gfs2/xattr.c | 10 +++++----- >> 8 files changed, 46 insertions(+), 14 deletions(-) >> > Hi Andreas, > > Most of this looks good. However, two comments: > > 1. I don't like adding a new u16 to the gfs2_inode. I've been working to > reduce the size of gfs2's inodes lately, so I'd rather see this > implemented as a new GIF_RAHEAD (or similar) flag in gfs2_inode's i_flags. > 2. It seems to me like we should take advantage of function gfs2_meta_ra() > which already submits one block and a variable number of additional > blocks for read-ahead, then waits for the first block IO to complete. The meta_ra thing is a bit of a hack, and best avoided for this use. We want to only send out a single I/O here rather than let the I/O stack do the merging after the fact, Steve. > Regards, > > Bob Peterson > Red Hat File Systems ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 1/2] gfs2: Extended attribute readahead 2015-11-03 19:02 ` Steven Whitehouse @ 2015-11-03 20:18 ` Andreas Gruenbacher 0 siblings, 0 replies; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-03 20:18 UTC (permalink / raw) To: cluster-devel.redhat.com Bob and Steve, On Tue, Nov 3, 2015 at 8:02 PM, Steven Whitehouse <swhiteho@redhat.com> wrote: >> Most of this looks good. However, two comments: >> >> 1. I don't like adding a new u16 to the gfs2_inode. I've been working to >> reduce the size of gfs2's inodes lately, so I'd rather see this >> implemented as a new GIF_RAHEAD (or similar) flag in gfs2_inode's >> i_flags. that's how I had this implemented in the previous version, before Steve requested to have it changed. It seems pretty unlikely to me that we'll readahead more than one block here, so one bit should actually be enough. >> 2. It seems to me like we should take advantage of function gfs2_meta_ra() >> which already submits one block and a variable number of additional >> blocks for read-ahead, then waits for the first block IO to complete. > The meta_ra thing is a bit of a hack, and best avoided for this use. We want > to only send out a single I/O here rather than let the I/O stack do the > merging after the fact, Well, the first patch might as well use gfs2_meta_ra I guess. I'm unsure whether submitting a single I/O request is worth it given that the code for doing that is quite ugly. If it is, we could make the argument that it's also worth it for gfs2_meta_ra. The block size might make a big difference here as well. Thanks, Andreas ^ permalink raw reply [flat|nested] 13+ messages in thread
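A minimal sketch of the flag-based alternative suggested above, for illustration only: GIF_RAHEAD is the name Bob proposes, but the bit number, the fixed one-block readahead, and the exact placement of the calls are assumptions rather than code from this thread; the patches posted here keep the u16 i_rahead field instead.

	/* Sketch only: assumed flag bit in gfs2_inode's i_flags. */
	#define GIF_RAHEAD 7

	/* In gfs2_dir_search(): remember de_rahead before the dirent buffer is
	 * released, then mark the inode once it has been looked up. */
	u16 rahead = be16_to_cpu(dent->de_rahead);
	/* ... */
	inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
	if (!IS_ERR(inode) && rahead)
		set_bit(GIF_RAHEAD, &GFS2_I(inode)->i_flags);

	/* In gfs2_meta_indirect_buffer(): with only a flag, the readahead
	 * amount is fixed at a single block. */
	int rahead = 0;

	if (num == ip->i_no_addr && test_bit(GIF_RAHEAD, &ip->i_flags))
		rahead = 1;
	ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);

This matches the point that one bit is enough as long as only a single extra block is ever read ahead; the wider i_rahead field would only matter if de_rahead could ever indicate more than one block.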
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-01 19:02 [Cluster-devel] [PATCH 0/2] Extended attribute readahead Andreas Gruenbacher 2015-11-01 19:02 ` [Cluster-devel] [PATCH 1/2] gfs2: " Andreas Gruenbacher @ 2015-11-01 19:02 ` Andreas Gruenbacher 2015-11-12 13:44 ` Steven Whitehouse 1 sibling, 1 reply; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-01 19:02 UTC (permalink / raw) To: cluster-devel.redhat.com Instead of submitting separate bio for the inode and its extended attributes, submit a single bio for both when possible. The entire request becomes a normal read, not a readahead. To keep track of the buffer heads that make up the bio, we allocate temporary buffer head arrays: in the endio handler, it would also be possible to compute the buffer head block numbers from the bio and to grab the buffer heads with gfs2_getbuf, but the code would become even messier. --- fs/gfs2/meta_io.c | 94 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0f24828..e650127 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +struct gfs2_meta_read { + int num; + struct buffer_head *bhs[0]; +}; + +static void gfs2_meta_read_endio(struct bio *bio) { + struct gfs2_meta_read *r = bio->bi_private; + int i; + + for (i = 0; i < r->num; i++) { + struct buffer_head *bh = r->bhs[i]; + + if (unlikely(bio_flagged(bio, BIO_QUIET))) + set_bit(BH_Quiet, &bh->b_state); + + bh->b_end_io(bh, !bio->bi_error); + } + bio_put(bio); + kfree(r); +} + +/* + * (See submit_bh_wbc.) 
+ */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) { - struct buffer_head *bh; + struct gfs2_meta_read *r; + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; - bh = gfs2_getbuf(gl, blkno, 1); - lock_buffer(bh); - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - brelse(bh); + if (!num) + return; + + if (num == 1) { + bh->b_end_io = end_buffer_read_sync; + submit_bh(rw, bh); return; } - bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META | REQ_PRIO, bh); + + r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]), + GFP_NOIO | __GFP_NOFAIL); + r->num = num; + for (i = 0; i < num; i++) + r->bhs[i] = bhs[i]; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + } + bio->bi_end_io = gfs2_meta_read_endio; + bio->bi_private = r; + + submit_bio(rw, bio); } /** @@ -216,7 +260,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { *bhp = NULL; @@ -228,18 +273,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; -- 2.5.0 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-01 19:02 ` [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization Andreas Gruenbacher @ 2015-11-12 13:44 ` Steven Whitehouse 2015-11-12 15:33 ` Andreas Gruenbacher 0 siblings, 1 reply; 13+ messages in thread From: Steven Whitehouse @ 2015-11-12 13:44 UTC (permalink / raw) To: cluster-devel.redhat.com Hi, On 01/11/15 19:02, Andreas Gruenbacher wrote: > Instead of submitting separate bio for the inode and its extended > attributes, submit a single bio for both when possible. The entire > request becomes a normal read, not a readahead. > > To keep track of the buffer heads that make up the bio, we allocate > temporary buffer head arrays: in the endio handler, it would also be > possible to compute the buffer head block numbers from the bio and to > grab the buffer heads with gfs2_getbuf, but the code would become even > messier. > --- > fs/gfs2/meta_io.c | 94 ++++++++++++++++++++++++++++++++++++++++++++----------- > 1 file changed, 76 insertions(+), 18 deletions(-) > > diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c > index 0f24828..e650127 100644 > --- a/fs/gfs2/meta_io.c > +++ b/fs/gfs2/meta_io.c > @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) > return bh; > } > > -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) > +struct gfs2_meta_read { > + int num; > + struct buffer_head *bhs[0]; > +}; > + > +static void gfs2_meta_read_endio(struct bio *bio) { > + struct gfs2_meta_read *r = bio->bi_private; > + int i; > + > + for (i = 0; i < r->num; i++) { > + struct buffer_head *bh = r->bhs[i]; > + > + if (unlikely(bio_flagged(bio, BIO_QUIET))) > + set_bit(BH_Quiet, &bh->b_state); > + > + bh->b_end_io(bh, !bio->bi_error); > + } > + bio_put(bio); > + kfree(r); > +} > + > +/* > + * (See submit_bh_wbc.) > + */ > +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) > { > - struct buffer_head *bh; > + struct gfs2_meta_read *r; > + struct buffer_head *bh = bhs[0]; > + struct bio *bio; > + int i; > > - bh = gfs2_getbuf(gl, blkno, 1); > - lock_buffer(bh); > - if (buffer_uptodate(bh)) { > - unlock_buffer(bh); > - brelse(bh); > + if (!num) > + return; > + > + if (num == 1) { > + bh->b_end_io = end_buffer_read_sync; > + submit_bh(rw, bh); > return; > } > - bh->b_end_io = end_buffer_read_sync; > - submit_bh(READA | REQ_META | REQ_PRIO, bh); > + > + r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]), > + GFP_NOIO | __GFP_NOFAIL); Can we avoid doing this I wonder? I would like to avoid adding GFP_NOFAIL allocations, and I think it should be possible to figure out where the buffers are using the bio itself (i.e. iterating over it, similar to gfs2_end_log_write() in lops.c) so that we don't need to allocate this additional memory. Otherwise this looks like the right approach I think, Steve. 
> + r->num = num; > + for (i = 0; i < num; i++) > + r->bhs[i] = bhs[i]; > + > + bio = bio_alloc(GFP_NOIO, num); > + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); > + bio->bi_bdev = bh->b_bdev; > + for (i = 0; i < num; i++) { > + bh = bhs[i]; > + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); > + } > + bio->bi_end_io = gfs2_meta_read_endio; > + bio->bi_private = r; > + > + submit_bio(rw, bio); > } > > /** > @@ -216,7 +260,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, > int rahead, struct buffer_head **bhp) > { > struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; > - struct buffer_head *bh; > + struct buffer_head *bh, *bhs[2]; > + int num = 0; > > if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { > *bhp = NULL; > @@ -228,18 +273,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, > lock_buffer(bh); > if (buffer_uptodate(bh)) { > unlock_buffer(bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > - return 0; > + flags &= ~DIO_WAIT; > + } else { > + bh->b_end_io = end_buffer_read_sync; > + get_bh(bh); > + bhs[num++] = bh; > } > - bh->b_end_io = end_buffer_read_sync; > - get_bh(bh); > - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > + > + if (rahead) { > + bh = gfs2_getbuf(gl, blkno + 1, CREATE); > + > + lock_buffer(bh); > + if (buffer_uptodate(bh)) { > + unlock_buffer(bh); > + brelse(bh); > + } else { > + bh->b_end_io = end_buffer_read_sync; > + bhs[num++] = bh; > + } > + } > + > + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); > if (!(flags & DIO_WAIT)) > return 0; > > + bh = *bhp; > wait_on_buffer(bh); > if (unlikely(!buffer_uptodate(bh))) { > struct gfs2_trans *tr = current->journal_info; ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-12 13:44 ` Steven Whitehouse @ 2015-11-12 15:33 ` Andreas Gruenbacher 2015-11-12 20:15 ` Andreas Gruenbacher 0 siblings, 1 reply; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-12 15:33 UTC (permalink / raw) To: cluster-devel.redhat.com On Thu, Nov 12, 2015 at 2:44 PM, Steven Whitehouse <swhiteho@redhat.com> wrote: > Hi, > > > On 01/11/15 19:02, Andreas Gruenbacher wrote: >> >> Instead of submitting separate bio for the inode and its extended >> attributes, submit a single bio for both when possible. The entire >> request becomes a normal read, not a readahead. >> >> To keep track of the buffer heads that make up the bio, we allocate >> temporary buffer head arrays: in the endio handler, it would also be >> possible to compute the buffer head block numbers from the bio and to >> grab the buffer heads with gfs2_getbuf, but the code would become even >> messier. >> --- >> fs/gfs2/meta_io.c | 94 >> ++++++++++++++++++++++++++++++++++++++++++++----------- >> 1 file changed, 76 insertions(+), 18 deletions(-) >> >> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c >> index 0f24828..e650127 100644 >> --- a/fs/gfs2/meta_io.c >> +++ b/fs/gfs2/meta_io.c >> @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock >> *gl, u64 blkno) >> return bh; >> } >> -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) >> +struct gfs2_meta_read { >> + int num; >> + struct buffer_head *bhs[0]; >> +}; >> + >> +static void gfs2_meta_read_endio(struct bio *bio) { >> + struct gfs2_meta_read *r = bio->bi_private; >> + int i; >> + >> + for (i = 0; i < r->num; i++) { >> + struct buffer_head *bh = r->bhs[i]; >> + >> + if (unlikely(bio_flagged(bio, BIO_QUIET))) >> + set_bit(BH_Quiet, &bh->b_state); >> + >> + bh->b_end_io(bh, !bio->bi_error); >> + } >> + bio_put(bio); >> + kfree(r); >> +} >> + >> +/* >> + * (See submit_bh_wbc.) >> + */ >> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) >> { >> - struct buffer_head *bh; >> + struct gfs2_meta_read *r; >> + struct buffer_head *bh = bhs[0]; >> + struct bio *bio; >> + int i; >> - bh = gfs2_getbuf(gl, blkno, 1); >> - lock_buffer(bh); >> - if (buffer_uptodate(bh)) { >> - unlock_buffer(bh); >> - brelse(bh); >> + if (!num) >> + return; >> + >> + if (num == 1) { >> + bh->b_end_io = end_buffer_read_sync; >> + submit_bh(rw, bh); >> return; >> } >> - bh->b_end_io = end_buffer_read_sync; >> - submit_bh(READA | REQ_META | REQ_PRIO, bh); >> + >> + r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]), >> + GFP_NOIO | __GFP_NOFAIL); > > Can we avoid doing this I wonder? I would like to avoid adding GFP_NOFAIL > allocations, and I think it should be possible to figure out where the > buffers are using the bio itself (i.e. iterating over it, similar to > gfs2_end_log_write() in lops.c) so that we don't need to allocate this > additional memory. Yes, that's better than going through gfs2_getbuf() again. I'll change that. Thanks, Andreas ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-12 15:33 ` Andreas Gruenbacher @ 2015-11-12 20:15 ` Andreas Gruenbacher 2015-11-12 20:33 ` Steven Whitehouse 2015-11-13 13:48 ` Bob Peterson 0 siblings, 2 replies; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-12 20:15 UTC (permalink / raw) To: cluster-devel.redhat.com Here is an updated version of this patch, please review. Thanks, Andreas -- Instead of submitting a READ_SYNC bio for the inode and a READA bio for the inode's extended attributes through submit_bh, submit a single READ bio for both strough submit_bio when possible. This can be more efficient on some kinds of block devices. Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> --- fs/gfs2/meta_io.c | 81 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0f24828..e137d96 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +static void gfs2_meta_read_endio(struct bio *bio) { - struct buffer_head *bh; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh = page_buffers(page); + unsigned int len = bvec->bv_len; + + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + struct buffer_head *next = bh->b_this_page; + len -= bh->b_size; + bh->b_end_io(bh, !bio->bi_error); + bh = next; + } while (bh && len); + } + bio_put(bio); +} - bh = gfs2_getbuf(gl, blkno, 1); - lock_buffer(bh); - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - brelse(bh); +/* + * Submit several consecutive buffer head I/O requests as a single bio I/O + * request. (See submit_bh_wbc.) 
+ */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +{ + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; + + if (!num) return; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } - bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META | REQ_PRIO, bh); + bio->bi_end_io = gfs2_meta_read_endio; + submit_bio(rw, bio); } /** @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { *bhp = NULL; @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; -- 2.5.0 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-12 20:15 ` Andreas Gruenbacher @ 2015-11-12 20:33 ` Steven Whitehouse 2015-11-13 22:24 ` Andreas Gruenbacher 2015-11-13 13:48 ` Bob Peterson 1 sibling, 1 reply; 13+ messages in thread From: Steven Whitehouse @ 2015-11-12 20:33 UTC (permalink / raw) To: cluster-devel.redhat.com Hi, On 12/11/15 20:15, Andreas Gruenbacher wrote: > Here is an updated version of this patch, please review. > > Thanks, > Andreas That looks good to me, and I assume that it should be faster too? Steve. > > -- > > Instead of submitting a READ_SYNC bio for the inode and a READA bio for > the inode's extended attributes through submit_bh, submit a single READ > bio for both strough submit_bio when possible. This can be more > efficient on some kinds of block devices. > > Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> > --- > fs/gfs2/meta_io.c | 81 ++++++++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 63 insertions(+), 18 deletions(-) > > diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c > index 0f24828..e137d96 100644 > --- a/fs/gfs2/meta_io.c > +++ b/fs/gfs2/meta_io.c > @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) > return bh; > } > > -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) > +static void gfs2_meta_read_endio(struct bio *bio) > { > - struct buffer_head *bh; > + struct bio_vec *bvec; > + int i; > + > + bio_for_each_segment_all(bvec, bio, i) { > + struct page *page = bvec->bv_page; > + struct buffer_head *bh = page_buffers(page); > + unsigned int len = bvec->bv_len; > + > + while (bh_offset(bh) < bvec->bv_offset) > + bh = bh->b_this_page; > + do { > + struct buffer_head *next = bh->b_this_page; > + len -= bh->b_size; > + bh->b_end_io(bh, !bio->bi_error); > + bh = next; > + } while (bh && len); > + } > + bio_put(bio); > +} > > - bh = gfs2_getbuf(gl, blkno, 1); > - lock_buffer(bh); > - if (buffer_uptodate(bh)) { > - unlock_buffer(bh); > - brelse(bh); > +/* > + * Submit several consecutive buffer head I/O requests as a single bio I/O > + * request. (See submit_bh_wbc.) 
> + */ > +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) > +{ > + struct buffer_head *bh = bhs[0]; > + struct bio *bio; > + int i; > + > + if (!num) > return; > + > + bio = bio_alloc(GFP_NOIO, num); > + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); > + bio->bi_bdev = bh->b_bdev; > + for (i = 0; i < num; i++) { > + bh = bhs[i]; > + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); > } > - bh->b_end_io = end_buffer_read_sync; > - submit_bh(READA | REQ_META | REQ_PRIO, bh); > + bio->bi_end_io = gfs2_meta_read_endio; > + submit_bio(rw, bio); > } > > /** > @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, > int rahead, struct buffer_head **bhp) > { > struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; > - struct buffer_head *bh; > + struct buffer_head *bh, *bhs[2]; > + int num = 0; > > if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { > *bhp = NULL; > @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, > lock_buffer(bh); > if (buffer_uptodate(bh)) { > unlock_buffer(bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > - return 0; > + flags &= ~DIO_WAIT; > + } else { > + bh->b_end_io = end_buffer_read_sync; > + get_bh(bh); > + bhs[num++] = bh; > } > - bh->b_end_io = end_buffer_read_sync; > - get_bh(bh); > - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > + > + if (rahead) { > + bh = gfs2_getbuf(gl, blkno + 1, CREATE); > + > + lock_buffer(bh); > + if (buffer_uptodate(bh)) { > + unlock_buffer(bh); > + brelse(bh); > + } else { > + bh->b_end_io = end_buffer_read_sync; > + bhs[num++] = bh; > + } > + } > + > + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); > if (!(flags & DIO_WAIT)) > return 0; > > + bh = *bhp; > wait_on_buffer(bh); > if (unlikely(!buffer_uptodate(bh))) { > struct gfs2_trans *tr = current->journal_info; ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-12 20:33 ` Steven Whitehouse @ 2015-11-13 22:24 ` Andreas Gruenbacher 2015-11-16 18:14 ` Bob Peterson 0 siblings, 1 reply; 13+ messages in thread From: Andreas Gruenbacher @ 2015-11-13 22:24 UTC (permalink / raw) To: cluster-devel.redhat.com On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse <swhiteho@redhat.com> wrote: > That looks good to me, and I assume that it should be faster too? I did some tests with a directory tree with 3439 directories and 51556 files in it. In that tree, 47313 or 86% of the 54995 files and directories ended up with readahead enabled, while 7682 didn't. On a slow SATA magnetic disc, I compared the time required for running "find", then "find | xargs stat", then "find | xargs getfattr". I booted with selinux=0 for these measurements, so "find" and "find | xargs stat" didn't use the xattrs. The results are attached as a spreadsheet. They show that xattr readahead can save quite some time (here, about -75%). The optimized version of xattr readahead is about 3% faster than the simple version in this test, so I'll leave it up to you whether that's worth it for you. It makes no difference whether the scheduler is "noop" or "deadline" here; the "noop" scheduler still seems to merge adjacent I/O requests (which makes sense). Andreas -------------- next part -------------- A non-text attachment was scrubbed... Name: gfs2-readahead.ods Type: application/vnd.oasis.opendocument.spreadsheet Size: 11411 bytes Desc: not available URL: <http://listman.redhat.com/archives/cluster-devel/attachments/20151113/9323ca98/attachment.ods> ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-13 22:24 ` Andreas Gruenbacher @ 2015-11-16 18:14 ` Bob Peterson 0 siblings, 0 replies; 13+ messages in thread From: Bob Peterson @ 2015-11-16 18:14 UTC (permalink / raw) To: cluster-devel.redhat.com ----- Original Message ----- > On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse <swhiteho@redhat.com> > wrote: > > That looks good to me, and I assume that it should be faster too? > > I did some tests with a directory tree with 3439 directories and 51556 > files in it. > > In that tree, 47313 or 86% of the 54995 files and directories ended up > with readahead enabled, while 7682 didn't. > > On a slow SATA magnetic disc, I compared the time required for running > "find", then "find | xargs stat", then "find | xargs getfattr". I > booted with selinux=0 for these measurements, so "find" and "find | > xargs stat" didn't use the xattrs. > > The results are attached as a spreadsheet. They show that xattr > readahead can save quite some time (here, about -75%). The optimized > version of xattr readahead is about 3% faster than the simple version > in this test, so I'll leave it up to you whether that's worth it for > you. > > It makes no difference whether the scheduler is "noop" or "deadline" > here; the "noop" scheduler still seems to merge adjacent I/O requests > (which makes sense). > > Andreas > Hi, Thanks. Both patches are now applied to the for-next branch of the linux-gfs2 tree: https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next&id=c8d577038449a718ad0027d1790b6ef4441715d4 https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next&id=843396ad9886458bddbc7d07137b2975876b029e Regards, Bob Peterson Red Hat File Systems ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization 2015-11-12 20:15 ` Andreas Gruenbacher 2015-11-12 20:33 ` Steven Whitehouse @ 2015-11-13 13:48 ` Bob Peterson 1 sibling, 0 replies; 13+ messages in thread From: Bob Peterson @ 2015-11-13 13:48 UTC (permalink / raw) To: cluster-devel.redhat.com ----- Original Message ----- > Here is an updated version of this patch, please review. > > Thanks, > Andreas > > -- > > Instead of submitting a READ_SYNC bio for the inode and a READA bio for > the inode's extended attributes through submit_bh, submit a single READ > bio for both strough submit_bio when possible. This can be more > efficient on some kinds of block devices. > > Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com> > --- > fs/gfs2/meta_io.c | 81 > ++++++++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 63 insertions(+), 18 deletions(-) > > diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c > index 0f24828..e137d96 100644 > --- a/fs/gfs2/meta_io.c > +++ b/fs/gfs2/meta_io.c > @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock > *gl, u64 blkno) > return bh; > } > > -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) > +static void gfs2_meta_read_endio(struct bio *bio) > { > - struct buffer_head *bh; > + struct bio_vec *bvec; > + int i; > + > + bio_for_each_segment_all(bvec, bio, i) { > + struct page *page = bvec->bv_page; > + struct buffer_head *bh = page_buffers(page); > + unsigned int len = bvec->bv_len; > + > + while (bh_offset(bh) < bvec->bv_offset) > + bh = bh->b_this_page; > + do { > + struct buffer_head *next = bh->b_this_page; > + len -= bh->b_size; > + bh->b_end_io(bh, !bio->bi_error); > + bh = next; > + } while (bh && len); > + } > + bio_put(bio); > +} > > - bh = gfs2_getbuf(gl, blkno, 1); > - lock_buffer(bh); > - if (buffer_uptodate(bh)) { > - unlock_buffer(bh); > - brelse(bh); > +/* > + * Submit several consecutive buffer head I/O requests as a single bio I/O > + * request. (See submit_bh_wbc.) 
> + */ > +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) > +{ > + struct buffer_head *bh = bhs[0]; > + struct bio *bio; > + int i; > + > + if (!num) > return; > + > + bio = bio_alloc(GFP_NOIO, num); > + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); > + bio->bi_bdev = bh->b_bdev; > + for (i = 0; i < num; i++) { > + bh = bhs[i]; > + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); > } > - bh->b_end_io = end_buffer_read_sync; > - submit_bh(READA | REQ_META | REQ_PRIO, bh); > + bio->bi_end_io = gfs2_meta_read_endio; > + submit_bio(rw, bio); > } > > /** > @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int > flags, > int rahead, struct buffer_head **bhp) > { > struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; > - struct buffer_head *bh; > + struct buffer_head *bh, *bhs[2]; > + int num = 0; > > if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { > *bhp = NULL; > @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, > int flags, > lock_buffer(bh); > if (buffer_uptodate(bh)) { > unlock_buffer(bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > - return 0; > + flags &= ~DIO_WAIT; > + } else { > + bh->b_end_io = end_buffer_read_sync; > + get_bh(bh); > + bhs[num++] = bh; > } > - bh->b_end_io = end_buffer_read_sync; > - get_bh(bh); > - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > + > + if (rahead) { > + bh = gfs2_getbuf(gl, blkno + 1, CREATE); > + > + lock_buffer(bh); > + if (buffer_uptodate(bh)) { > + unlock_buffer(bh); > + brelse(bh); > + } else { > + bh->b_end_io = end_buffer_read_sync; > + bhs[num++] = bh; > + } > + } > + > + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); > if (!(flags & DIO_WAIT)) > return 0; > > + bh = *bhp; > wait_on_buffer(bh); > if (unlikely(!buffer_uptodate(bh))) { > struct gfs2_trans *tr = current->journal_info; > -- > 2.5.0 > > Hi, ACK to both patches Looks good. I'll hold onto them until this merge window closes. Regards, Bob Peterson Red Hat File Systems ^ permalink raw reply [flat|nested] 13+ messages in thread