From mboxrd@z Thu Jan 1 00:00:00 1970 From: Steven Whitehouse Date: Mon, 8 Oct 2018 11:33:00 +0100 Subject: [Cluster-devel] [PATCH 11/11] gfs2: Add local resource group locking In-Reply-To: <20181005191854.2566-12-agruenba@redhat.com> References: <20181005191854.2566-1-agruenba@redhat.com> <20181005191854.2566-12-agruenba@redhat.com> Message-ID: <11f154eb-9e66-c06a-ffce-1aae594919c8@redhat.com> List-Id: To: cluster-devel.redhat.com MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Hi, On 05/10/18 20:18, Andreas Gruenbacher wrote: > From: Bob Peterson > > Prepare for treating resource group glocks as exclusive among nodes but > shared among all tasks running on a node: introduce another layer of > node-specific locking that the local tasks can use to coordinate their > accesses. > > This patch only introduces the local locking changes necessary so that > future patches can introduce resource group glock sharing. We replace > the resource group spinlock with a mutex; whether that leads to > noticeable additional contention on the resource group mutex remains to > be seen. 
> > Signed-off-by: Andreas Gruenbacher > --- > fs/gfs2/incore.h | 3 +- > fs/gfs2/lops.c | 5 ++- > fs/gfs2/rgrp.c | 97 +++++++++++++++++++++++++++++++++++------------- > fs/gfs2/rgrp.h | 4 ++ > 4 files changed, 81 insertions(+), 28 deletions(-) > > diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h > index 932e63924f7e..2fa47b476eef 100644 > --- a/fs/gfs2/incore.h > +++ b/fs/gfs2/incore.h > @@ -23,6 +23,7 @@ > #include > #include > #include > +#include > > #define DIO_WAIT 0x00000010 > #define DIO_METADATA 0x00000020 > @@ -120,7 +121,7 @@ struct gfs2_rgrpd { > #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ > #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ > #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ > - spinlock_t rd_rsspin; /* protects reservation related vars */ > + struct mutex rd_lock; I can see why we might need to have additional local rgrp locking, but why do we need to make the rd_rsspin into a mutex? I'm wondering if these should not be separate locks still? Steve. 
> struct rb_root rd_rstree; /* multi-block reservation tree */ > }; > > diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c > index 4c7069b8f3c1..a9e858e01c97 100644 > --- a/fs/gfs2/lops.c > +++ b/fs/gfs2/lops.c > @@ -76,8 +76,9 @@ static void maybe_release_space(struct gfs2_bufdata *bd) > unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; > struct gfs2_bitmap *bi = rgd->rd_bits + index; > > + rgrp_lock_local(rgd); > if (bi->bi_clone == NULL) > - return; > + goto out; > if (sdp->sd_args.ar_discard) > gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); > memcpy(bi->bi_clone + bi->bi_offset, > @@ -85,6 +86,8 @@ static void maybe_release_space(struct gfs2_bufdata *bd) > clear_bit(GBF_FULL, &bi->bi_flags); > rgd->rd_free_clone = rgd->rd_free; > rgd->rd_extfail_pt = rgd->rd_free; > +out: > + rgrp_unlock_local(rgd); > } > > /** > diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c > index 8a6b41f3667c..a89be4782c15 100644 > --- a/fs/gfs2/rgrp.c > +++ b/fs/gfs2/rgrp.c > @@ -702,10 +702,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) > > rgd = rs->rs_rgd; > if (rgd) { > - spin_lock(&rgd->rd_rsspin); > + rgrp_lock_local(rgd); > __rs_deltree(rs); > BUG_ON(rs->rs_free); > - spin_unlock(&rgd->rd_rsspin); > + rgrp_unlock_local(rgd); > } > } > > @@ -737,12 +737,12 @@ static void return_all_reservations(struct gfs2_rgrpd *rgd) > struct rb_node *n; > struct gfs2_blkreserv *rs; > > - spin_lock(&rgd->rd_rsspin); > + rgrp_lock_local(rgd); > while ((n = rb_first(&rgd->rd_rstree))) { > rs = rb_entry(n, struct gfs2_blkreserv, rs_node); > __rs_deltree(rs); > } > - spin_unlock(&rgd->rd_rsspin); > + rgrp_unlock_local(rgd); > } > > void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) > @@ -948,7 +948,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) > rgd->rd_data0 = be64_to_cpu(buf.ri_data0); > rgd->rd_data = be32_to_cpu(buf.ri_data); > rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); > - spin_lock_init(&rgd->rd_rsspin); > + mutex_init(&rgd->rd_lock); > > error = 
compute_bitstructs(rgd); > if (error) > @@ -1469,9 +1469,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp) > /* Trim each bitmap in the rgrp */ > for (x = 0; x < rgd->rd_length; x++) { > struct gfs2_bitmap *bi = rgd->rd_bits + x; > + rgrp_lock_local(rgd); > ret = gfs2_rgrp_send_discards(sdp, > rgd->rd_data0, NULL, bi, minlen, > &amt); > + rgrp_unlock_local(rgd); > if (ret) { > gfs2_glock_dq_uninit(&gh); > goto out; > @@ -1483,9 +1485,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp) > ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); > if (ret == 0) { > bh = rgd->rd_bits[0].bi_bh; > + rgrp_lock_local(rgd); > rgd->rd_flags |= GFS2_RGF_TRIMMED; > gfs2_trans_add_meta(rgd->rd_gl, bh); > gfs2_rgrp_out(rgd, bh->b_data); > + rgrp_unlock_local(rgd); > gfs2_trans_end(sdp); > } > } > @@ -1519,7 +1523,6 @@ static void rs_insert(struct gfs2_inode *ip) > > BUG_ON(gfs2_rs_active(rs)); > > - spin_lock(&rgd->rd_rsspin); > newn = &rgd->rd_rstree.rb_node; > while (*newn) { > struct gfs2_blkreserv *cur = > @@ -1532,7 +1535,6 @@ static void rs_insert(struct gfs2_inode *ip) > else if (rc < 0) > newn = &((*newn)->rb_left); > else { > - spin_unlock(&rgd->rd_rsspin); > WARN_ON(1); > return; > } > @@ -1540,7 +1542,6 @@ static void rs_insert(struct gfs2_inode *ip) > > rb_link_node(&rs->rs_node, parent, newn); > rb_insert_color(&rs->rs_node, &rgd->rd_rstree); > - spin_unlock(&rgd->rd_rsspin); > trace_gfs2_rs(rs, TRACE_RS_INSERT); > } > > @@ -1632,7 +1633,6 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, > struct rb_node *n; > int rc; > > - spin_lock(&rgd->rd_rsspin); > n = rgd->rd_rstree.rb_node; > while (n) { > rs = rb_entry(n, struct gfs2_blkreserv, rs_node); > @@ -1655,7 +1655,6 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, > } > } > > - spin_unlock(&rgd->rd_rsspin); > return block; > } > > @@ -1857,7 +1856,22 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip > struct gfs2_rbm rbm = { 
.rgd = rgd, .bii = 0, .offset = 0 }; > > while (1) { > + /* > + * We must be careful to avoid deadlock here: > + * All transactions expect: sd_log_flush_lock followed by rgrp > + * ex (if needed), but try_rgrp_unlink takes sd_log_flush_lock > + * outside a transaction and therefore must not have the rgrp > + * ex already held. To avoid deadlock, we drop the rgrp ex lock > + * before taking the log_flush_lock, then reacquire it to > + * protect our call to gfs2_rbm_find. > + * > + * Also note that rgrp_unlock_local must come AFTER the caller does > + * gfs2_rs_deltree because rgrp ex needs to be held before > + * making reservations. > + */ > + rgrp_unlock_local(rgd); > down_write(&sdp->sd_log_flush_lock); > + rgrp_lock_local(rgd); > error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, > true); > up_write(&sdp->sd_log_flush_lock); > @@ -2055,7 +2069,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) > struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); > struct gfs2_rgrpd *begin = NULL; > struct gfs2_blkreserv *rs = &ip->i_res; > - int error = 0, rg_locked, flags = 0; > + int error = 0, flags = 0; > + bool rg_locked; > u64 last_unlinked = NO_BLOCK; > int loops = 0; > u32 free_blocks, skip = 0; > @@ -2081,10 +2096,10 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) > return -EBADSLT; > > while (loops < 3) { > - rg_locked = 1; > - > - if (!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) { > - rg_locked = 0; > + rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl); > + if (rg_locked) { > + rgrp_lock_local(rs->rs_rgd); > + } else { > if (skip && skip--) > goto next_rgrp; > if (!gfs2_rs_active(rs)) { > @@ -2101,12 +2116,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) > &ip->i_rgd_gh); > if (unlikely(error)) > return error; > + rgrp_lock_local(rs->rs_rgd); > if (!gfs2_rs_active(rs) && (loops < 2) && > gfs2_rgrp_congested(rs->rs_rgd, loops)) > goto skip_rgrp; > if 
(sdp->sd_args.ar_rgrplvb) { > error = update_rgrp_lvb(rs->rs_rgd); > if (unlikely(error)) { > + rgrp_unlock_local(rs->rs_rgd); > gfs2_glock_dq_uninit(&ip->i_rgd_gh); > return error; > } > @@ -2140,9 +2157,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) > rs->rs_reserved = ap->target; > if (rs->rs_reserved > free_blocks) > rs->rs_reserved = free_blocks; > - spin_lock(&rs->rs_rgd->rd_rsspin); > rgd->rd_reserved += rs->rs_reserved; > - spin_unlock(&rs->rs_rgd->rd_rsspin); > + rgrp_unlock_local(rs->rs_rgd); > return 0; > } > check_rgrp: > @@ -2151,6 +2167,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) > try_rgrp_unlink(rs->rs_rgd, &last_unlinked, > ip->i_no_addr); > skip_rgrp: > + rgrp_unlock_local(rs->rs_rgd); > + > /* Drop reservation, if we couldn't use reserved rgrp */ > if (gfs2_rs_active(rs)) > gfs2_rs_deltree(rs); > @@ -2199,10 +2217,10 @@ void gfs2_inplace_release(struct gfs2_inode *ip) > if (rs->rs_reserved) { > struct gfs2_rgrpd *rgd = rs->rs_rgd; > > - spin_lock(&rgd->rd_rsspin); > + rgrp_lock_local(rgd); > BUG_ON(rgd->rd_reserved < rs->rs_reserved); > rgd->rd_reserved -= rs->rs_reserved; > - spin_unlock(&rs->rs_rgd->rd_rsspin); > + rgrp_unlock_local(rgd); > rs->rs_reserved = 0; > } > if (gfs2_holder_initialized(&ip->i_rgd_gh)) > @@ -2304,12 +2322,12 @@ void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) > be32_to_cpu(rgl->rl_free), > be32_to_cpu(rgl->rl_dinodes)); > } > - spin_lock(&rgd->rd_rsspin); > + rgrp_lock_local(rgd); > for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { > trs = rb_entry(n, struct gfs2_blkreserv, rs_node); > dump_rs(seq, trs); > } > - spin_unlock(&rgd->rd_rsspin); > + rgrp_unlock_local(rgd); > } > > static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) > @@ -2339,7 +2357,6 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, > struct gfs2_blkreserv *rs = &ip->i_res; > struct gfs2_rgrpd *rgd = rbm->rgd; > > - 
spin_lock(&rgd->rd_rsspin); > BUG_ON(rs->rs_reserved < len); > rgd->rd_reserved -= len; > rs->rs_reserved -= len; > @@ -2355,15 +2372,13 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, > trace_gfs2_rs(rs, TRACE_RS_CLAIM); > if (rs->rs_start < rgd->rd_data0 + rgd->rd_data && > rs->rs_free) > - goto out; > + return; > /* We used up our block reservation, so we should > reserve more blocks next time. */ > atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint); > } > __rs_deltree(rs); > } > -out: > - spin_unlock(&rgd->rd_rsspin); > } > > /** > @@ -2416,6 +2431,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, > > BUG_ON(ip->i_res.rs_reserved < *nblocks); > > + rgrp_lock_local(rbm.rgd); > gfs2_set_alloc_start(&rbm, ip, dinode); > error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); > > @@ -2460,6 +2476,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, > } > > rbm.rgd->rd_free -= *nblocks; > + rbm.rgd->rd_free_clone -= *nblocks; > if (dinode) { > rbm.rgd->rd_dinodes++; > *generation = rbm.rgd->rd_igeneration++; > @@ -2469,6 +2486,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, > > gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); > gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); > + rgrp_unlock_local(rbm.rgd); > > gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); > if (dinode) > @@ -2476,13 +2494,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, > > gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); > > - rbm.rgd->rd_free_clone -= *nblocks; > trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, > dinode ? 
GFS2_BLKST_DINODE : GFS2_BLKST_USED); > *bn = block; > return 0; > > rgrp_error: > + rgrp_unlock_local(rbm.rgd); > gfs2_rgrp_error(rbm.rgd); > return -EIO; > } > @@ -2502,12 +2520,14 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, > { > struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); > > + rgrp_lock_local(rgd); > rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE); > trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); > rgd->rd_free += blen; > rgd->rd_flags &= ~GFS2_RGF_TRIMMED; > gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); > gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); > + rgrp_unlock_local(rgd); > > /* Directories keep their data in the metadata address space */ > if (meta || ip->i_depth) > @@ -2543,17 +2563,20 @@ void gfs2_unlink_di(struct inode *inode) > rgd = gfs2_blk2rgrpd(sdp, blkno, true); > if (!rgd) > return; > + rgrp_lock_local(rgd); > rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED); > trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); > gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); > gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); > be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1); > + rgrp_unlock_local(rgd); > } > > void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) > { > struct gfs2_sbd *sdp = rgd->rd_sbd; > > + rgrp_lock_local(rgd); > rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); > if (!rgd->rd_dinodes) > gfs2_consist_rgrpd(rgd); > @@ -2562,6 +2585,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) > > gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); > gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); > + rgrp_unlock_local(rgd); > be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1); > > gfs2_statfs_change(sdp, 0, +1, -1); > @@ -2576,6 +2600,10 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) > * @no_addr: The block number to check > * @type: The block type we are looking for > * > + * The inode glock of @no_addr 
must be held. The @type to check for is either > + * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE > + * or GFS2_BLKST_USED would make no sense. > + * > * Returns: 0 if the block type matches the expected type > * -ESTALE if it doesn't match > * or -ve errno if something went wrong while checking > @@ -2601,6 +2629,12 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) > if (WARN_ON_ONCE(error)) > goto fail; > > + /* > + * No need to take the local resource group lock here; the inode glock > + * of @no_addr provides the necessary synchronization in case the block > + * is an inode. (In case the block is not an inode, the block type > + * will not match the @type we are looking for.) > + */ > if (gfs2_testbit(&rbm, false) != type) > error = -ESTALE; > > @@ -2723,3 +2757,14 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) > } > } > > +void rgrp_lock_local(struct gfs2_rgrpd *rgd) > +{ > + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) && > + !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags)); > + mutex_lock(&rgd->rd_lock); > +} > + > +void rgrp_unlock_local(struct gfs2_rgrpd *rgd) > +{ > + mutex_unlock(&rgd->rd_lock); > +} > diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h > index b596c3d17988..33e52dab76ef 100644 > --- a/fs/gfs2/rgrp.h > +++ b/fs/gfs2/rgrp.h > @@ -92,4 +92,8 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) > } > > extern void check_and_update_goal(struct gfs2_inode *ip); > + > +extern void rgrp_lock_local(struct gfs2_rgrpd *rgd); > +extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd); > + > #endif /* __RGRP_DOT_H__ */