From mboxrd@z Thu Jan 1 00:00:00 1970 From: Steven Whitehouse Date: Fri, 10 Aug 2012 12:19:42 +0100 Subject: [Cluster-devel] [GFS2 PATCH] GFS2: Break ordered_write list by rgrp for faster sorting In-Reply-To: <934954884.4875349.1344539896576.JavaMail.root@redhat.com> References: <934954884.4875349.1344539896576.JavaMail.root@redhat.com> Message-ID: <1344597583.2717.6.camel@menhir> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi, While it makes sense to accumulate the ordered write buffers on per-rgrp lists so that they are partially pre-sorted, I'm less convinced that there is a need to retain a per-rgrp "writing" list, since we must traverse all of those items anyway. By keeping that as a global list we then do not have the added complication of traversing the resource groups. This might be a considerable win on a large filesystem with only a very few dirty data blocks. Also, it would make much more sense to use a per-rgrp lock on the per-rgrp ordered list in order to get the most benefit from this change, Steve. On Thu, 2012-08-09 at 15:18 -0400, Bob Peterson wrote: > Hi, > > This patch moves the ordered_write buffer list from the superblock > to the rgrps. That makes for several lists that are smaller, and > therefore faster to find and sort. > > Regards, > > Bob Peterson > Red Hat File Systems > > Signed-off-by: Bob Peterson > --- > diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c > index 01c4975..3708e39 100644 > --- a/fs/gfs2/aops.c > +++ b/fs/gfs2/aops.c > @@ -1083,8 +1083,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) > bh->b_private = NULL; > } > gfs2_log_unlock(sdp); > - if (bd) > + if (bd) { > + BUG_ON(!list_empty(&bd->bd_list)); > kmem_cache_free(gfs2_bufdata_cachep, bd); > + } > > bh = bh->b_this_page; > } while (bh != head); > diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h > index 99d7c64..d380152 100644 > --- a/fs/gfs2/incore.h > +++ b/fs/gfs2/incore.h > @@ -97,6 +97,8 @@ struct gfs2_rgrpd { > #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ > #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ > #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ > + struct list_head rd_log_le_ordered; > + struct list_head rd_log_le_writing; > spinlock_t rd_rsspin; /* protects reservation related vars */ > struct rb_root rd_rstree; /* multi-block reservation tree */ > u32 rd_rs_cnt; /* count of current reservations */ > @@ -723,7 +725,6 @@ struct gfs2_sbd { > struct list_head sd_log_le_buf; > struct list_head sd_log_le_revoke; > struct list_head sd_log_le_databuf; > - struct list_head sd_log_le_ordered; > > atomic_t sd_log_thresh1; > atomic_t sd_log_thresh2; > diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c > index f4beeb9..ce3e86d 100644 > --- a/fs/gfs2/log.c > +++ b/fs/gfs2/log.c > @@ -28,6 +28,7 @@ > #include "log.h" > #include "lops.h" > #include "meta_io.h" > +#include "rgrp.h" > #include "util.h" > #include "dir.h" > #include "trace_gfs2.h" > @@ -500,29 +501,38 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) > { > struct gfs2_bufdata *bd; > struct buffer_head *bh; > - LIST_HEAD(written); > + struct gfs2_rgrpd *rgd, *rg1; > > gfs2_log_lock(sdp); > - list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); > - while (!list_empty(&sdp->sd_log_le_ordered)) { > - bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); > - list_move(&bd->bd_list, &written); > - bh = bd->bd_bh; > - if (!buffer_dirty(bh)) > - continue; > - get_bh(bh); > - gfs2_log_unlock(sdp); > - lock_buffer(bh); > - if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { > - bh->b_end_io = end_buffer_write_sync; > - submit_bh(WRITE_SYNC, bh); > - } else { > - unlock_buffer(bh); > - brelse(bh); > + rgd = rg1 = gfs2_rgrpd_get_first(sdp); > + while (rgd) { > + if (!list_empty(&rgd->rd_log_le_ordered) && > + !list_is_last(rgd->rd_log_le_ordered.next, > + &rgd->rd_log_le_ordered)) > + list_sort(NULL, &rgd->rd_log_le_ordered, &bd_cmp); > + while (!list_empty(&rgd->rd_log_le_ordered)) { > + bd = list_entry(rgd->rd_log_le_ordered.next, > + struct gfs2_bufdata, bd_list); > + list_move(&bd->bd_list, &rgd->rd_log_le_writing); > + bh = bd->bd_bh; > + if (!buffer_dirty(bh)) > + continue; > + get_bh(bh); > + gfs2_log_unlock(sdp); > + lock_buffer(bh); > + if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { > + bh->b_end_io = end_buffer_write_sync; > + submit_bh(WRITE_SYNC, bh); > + } else { > + unlock_buffer(bh); > + brelse(bh); > + } > + gfs2_log_lock(sdp); > } > - gfs2_log_lock(sdp); > + rgd = gfs2_rgrpd_get_next(rgd); > + if (rgd == rg1) > + break; > } > - list_splice(&written, &sdp->sd_log_le_ordered); > gfs2_log_unlock(sdp); > } > > @@ -530,20 +540,28 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) > { > struct gfs2_bufdata *bd; > struct buffer_head *bh; > + struct gfs2_rgrpd *rgd, *rg1; > > gfs2_log_lock(sdp); > - while (!list_empty(&sdp->sd_log_le_ordered)) { > - bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); > - bh = bd->bd_bh; > - if (buffer_locked(bh)) { > - get_bh(bh); > - gfs2_log_unlock(sdp); > - wait_on_buffer(bh); > - brelse(bh); > - gfs2_log_lock(sdp); > - continue; > + rgd = rg1 = gfs2_rgrpd_get_first(sdp); > + while (rgd) { > + while (!list_empty(&rgd->rd_log_le_writing)) { > + bd = list_entry(rgd->rd_log_le_writing.prev, > + struct gfs2_bufdata, bd_list); > + list_del_init(&bd->bd_list); > + bh = bd->bd_bh; > + if (bh && buffer_locked(bh)) { > + get_bh(bh); > + gfs2_log_unlock(sdp); > + wait_on_buffer(bh); > + brelse(bh); > + gfs2_log_lock(sdp); > + continue; > + } > } > - list_del_init(&bd->bd_list); > + rgd = gfs2_rgrpd_get_next(rgd); > + if (rgd == rg1) > + break; > } > gfs2_log_unlock(sdp); > } > diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c > index 8ff95a2..39c483e 100644 > --- a/fs/gfs2/lops.c > +++ b/fs/gfs2/lops.c > @@ -672,6 +672,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) > gl = bd->bd_gl; > atomic_dec(&gl->gl_revokes); > clear_bit(GLF_LFLUSH, &gl->gl_flags); > + BUG_ON(!list_empty(&bd->bd_list)); > kmem_cache_free(gfs2_bufdata_cachep, bd); > } > } > @@ -776,6 +777,7 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) > struct gfs2_trans *tr = current->journal_info; > struct address_space *mapping = bd->bd_bh->b_page->mapping; > struct gfs2_inode *ip = GFS2_I(mapping->host); > + struct gfs2_rgrpd *rgd; > > lock_buffer(bd->bd_bh); > gfs2_log_lock(sdp); > @@ -791,7 +793,13 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) > sdp->sd_log_num_databuf++; > list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); > } else { > - list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); > + if (ip->i_rgd && > + rgrp_contains_block(ip->i_rgd, bd->bd_bh->b_blocknr)) > + rgd = ip->i_rgd; > + else > + rgd = gfs2_blk2rgrpd(sdp, bd->bd_bh->b_blocknr, 1); > + BUG_ON(rgd == NULL); > + list_add_tail(&bd->bd_list, &rgd->rd_log_le_ordered); > } > out: > gfs2_log_unlock(sdp); > diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c > index e5af9dc..6b72d07 100644 > --- a/fs/gfs2/ops_fstype.c > +++ b/fs/gfs2/ops_fstype.c > @@ -100,7 +100,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) > INIT_LIST_HEAD(&sdp->sd_log_le_buf); > INIT_LIST_HEAD(&sdp->sd_log_le_revoke); > INIT_LIST_HEAD(&sdp->sd_log_le_databuf); > - INIT_LIST_HEAD(&sdp->sd_log_le_ordered); > > init_waitqueue_head(&sdp->sd_log_waitq); > init_waitqueue_head(&sdp->sd_logd_waitq); > diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c > index 47d2346..f9b2baf 100644 > --- a/fs/gfs2/rgrp.c > +++ b/fs/gfs2/rgrp.c > @@ -331,13 +331,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) > } > } > > -static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) > -{ > - u64 first = rgd->rd_data0; > - u64 last = first + rgd->rd_data; > - return first <= block && block < last; > -} > - > /** > * gfs2_blk2rgrpd - Find resource group for a given data/meta block number > * @sdp: The GFS2 superblock > @@ -754,6 +747,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) > rgd->rd_data0 = be64_to_cpu(buf.ri_data0); > rgd->rd_data = be32_to_cpu(buf.ri_data); > rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); > + INIT_LIST_HEAD(&rgd->rd_log_le_ordered); > + INIT_LIST_HEAD(&rgd->rd_log_le_writing); > spin_lock_init(&rgd->rd_rsspin); > > error = compute_bitstructs(rgd); > diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h > index c98f6af..a379320 100644 > --- a/fs/gfs2/rgrp.h > +++ b/fs/gfs2/rgrp.h > @@ -73,6 +73,13 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, > const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); > extern int gfs2_fitrim(struct file *filp, void __user *argp); > > +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) > +{ > + u64 first = rgd->rd_data0; > + u64 last = first + rgd->rd_data; > + return first <= block && block < last; > +} > + > /* This is how to tell if a reservation is in the rgrp tree: */ > static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) > { > diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c > index adbd278..724f724 100644 > --- a/fs/gfs2/trans.c > +++ b/fs/gfs2/trans.c > @@ -186,6 +186,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) > list_del_init(&bd->bd_list); > gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); > sdp->sd_log_num_revoke--; > + BUG_ON(!list_empty(&bd->bd_list)); > kmem_cache_free(gfs2_bufdata_cachep, bd); > tr->tr_num_revoke_rm++; > if (--n == 0) >