[Cluster-devel] [PATCH v2 4/6] gfs2: Add local resource group locking

cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed

From: Andreas Gruenbacher <agruenba@redhat.com>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] [PATCH v2 4/6] gfs2: Add local resource group locking
Date: Sat,  1 Dec 2018 12:10:17 +0100	[thread overview]
Message-ID: <20181201111019.14363-5-agruenba@redhat.com> (raw)
In-Reply-To: <20181201111019.14363-1-agruenba@redhat.com>

From: Bob Peterson <rpeterso@redhat.com>

Prepare for treating resource group glocks as exclusive among nodes but
shared among all tasks running on a node: introduce another layer of
node-specific locking that the local tasks can use to coordinate their
accesses.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/incore.h |  2 ++
 fs/gfs2/lops.c   |  5 +++-
 fs/gfs2/rgrp.c   | 74 ++++++++++++++++++++++++++++++++++++++++++------
 fs/gfs2/rgrp.h   |  4 +++
 4 files changed, 75 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ca25043fc26df..feba57a2a6bab 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -23,6 +23,7 @@
 #include <linux/percpu.h>
 #include <linux/lockref.h>
 #include <linux/rhashtable.h>
+#include <linux/mutex.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -121,6 +122,7 @@ struct gfs2_rgrpd {
 #define GFS2_RDF_PREFERRED	0x80000000 /* This rgrp is preferred */
 #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
 	spinlock_t rd_rsspin;           /* protects reservation related vars */
+	struct mutex rd_mutex;
 	struct rb_root rd_rstree;       /* multi-block reservation tree */
 };
 
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4c7069b8f3c1d..a9e858e01c97f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -76,8 +76,9 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
 	unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
 	struct gfs2_bitmap *bi = rgd->rd_bits + index;
 
+	rgrp_lock_local(rgd);
 	if (bi->bi_clone == NULL)
-		return;
+		goto out;
 	if (sdp->sd_args.ar_discard)
 		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
 	memcpy(bi->bi_clone + bi->bi_offset,
@@ -85,6 +86,8 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
 	clear_bit(GBF_FULL, &bi->bi_flags);
 	rgd->rd_free_clone = rgd->rd_free;
 	rgd->rd_extfail_pt = rgd->rd_free;
+out:
+	rgrp_unlock_local(rgd);
 }
 
 /**
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 1f427459a584d..95ecd81e61e70 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -952,6 +952,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 	rgd->rd_data = be32_to_cpu(buf.ri_data);
 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
 	spin_lock_init(&rgd->rd_rsspin);
+	mutex_init(&rgd->rd_mutex);
 
 	error = compute_bitstructs(rgd);
 	if (error)
@@ -1472,9 +1473,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 			/* Trim each bitmap in the rgrp */
 			for (x = 0; x < rgd->rd_length; x++) {
 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
+				rgrp_lock_local(rgd);
 				ret = gfs2_rgrp_send_discards(sdp,
 						rgd->rd_data0, NULL, bi, minlen,
 						&amt);
+				rgrp_unlock_local(rgd);
 				if (ret) {
 					gfs2_glock_dq_uninit(&gh);
 					goto out;
@@ -1486,9 +1489,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
 			if (ret == 0) {
 				bh = rgd->rd_bits[0].bi_bh;
+				rgrp_lock_local(rgd);
 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
 				gfs2_trans_add_meta(rgd->rd_gl, bh);
 				gfs2_rgrp_out(rgd, bh->b_data);
+				rgrp_unlock_local(rgd);
 				gfs2_trans_end(sdp);
 			}
 		}
@@ -1864,7 +1869,22 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
 
 	while (1) {
+		/*
+		 * We must be careful to avoid deadlock here:
+		 * All transactions take sd_log_flush_lock first, followed by
+		 * the local rgrp lock (if needed), but try_rgrp_unlink takes
+		 * sd_log_flush_lock outside a transaction and therefore must
+		 * not already hold the local rgrp lock. To avoid deadlock, we
+		 * drop the local rgrp lock before taking sd_log_flush_lock,
+		 * then reacquire it to protect our call to gfs2_rbm_find.
+		 *
+		 * Also note that rgrp_unlock_local must come AFTER the caller
+		 * does gfs2_rs_deltree because the local rgrp lock needs to be
+		 * held before making reservations.
+		 */
+		rgrp_unlock_local(rgd);
 		down_write(&sdp->sd_log_flush_lock);
+		rgrp_lock_local(rgd);
 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
 				      true);
 		up_write(&sdp->sd_log_flush_lock);
@@ -2062,7 +2082,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *begin = NULL;
 	struct gfs2_blkreserv *rs = &ip->i_res;
-	int error = 0, rg_locked, flags = 0;
+	int error = 0, flags = 0;
+	bool rg_locked;
 	u64 last_unlinked = NO_BLOCK;
 	int loops = 0;
 	u32 free_blocks, skip = 0;
@@ -2088,10 +2109,10 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 		return -EBADSLT;
 
 	while (loops < 3) {
-		rg_locked = 1;
-
-		if (!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
-			rg_locked = 0;
+		rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl);
+		if (rg_locked) {
+			rgrp_lock_local(rs->rs_rgd);
+		} else {
 			if (skip && skip--)
 				goto next_rgrp;
 			if (!gfs2_rs_active(rs)) {
@@ -2108,12 +2129,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 						   &ip->i_rgd_gh);
 			if (unlikely(error))
 				return error;
+			rgrp_lock_local(rs->rs_rgd);
 			if (!gfs2_rs_active(rs) && (loops < 2) &&
 			    gfs2_rgrp_congested(rs->rs_rgd, loops))
 				goto skip_rgrp;
 			if (sdp->sd_args.ar_rgrplvb) {
 				error = update_rgrp_lvb(rs->rs_rgd);
 				if (unlikely(error)) {
+					rgrp_unlock_local(rs->rs_rgd);
 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
 					return error;
 				}
@@ -2147,9 +2170,10 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 			rs->rs_reserved = ap->target;
 			if (rs->rs_reserved > free_blocks)
 				rs->rs_reserved = free_blocks;
-			spin_lock(&rs->rs_rgd->rd_rsspin);
+			spin_lock(&rgd->rd_rsspin);
 			rgd->rd_reserved += rs->rs_reserved;
-			spin_unlock(&rs->rs_rgd->rd_rsspin);
+			spin_unlock(&rgd->rd_rsspin);
+			rgrp_unlock_local(rs->rs_rgd);
 			return 0;
 		}
 check_rgrp:
@@ -2158,6 +2182,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 			try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
 					ip->i_no_addr);
 skip_rgrp:
+		rgrp_unlock_local(rs->rs_rgd);
+
 		/* Drop reservation, if we couldn't use reserved rgrp */
 		if (gfs2_rs_active(rs))
 			gfs2_rs_deltree(rs);
@@ -2209,7 +2235,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 		spin_lock(&rgd->rd_rsspin);
 		BUG_ON(rgd->rd_reserved < rs->rs_reserved);
 		rgd->rd_reserved -= rs->rs_reserved;
-		spin_unlock(&rs->rs_rgd->rd_rsspin);
+		spin_unlock(&rgd->rd_rsspin);
 		rs->rs_reserved = 0;
 	}
 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
@@ -2300,6 +2326,7 @@ void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 
 	if (rgd == NULL)
 		return;
+	spin_lock(&rgd->rd_rsspin);
 	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
@@ -2312,7 +2339,6 @@ void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 			       be32_to_cpu(rgl->rl_free),
 			       be32_to_cpu(rgl->rl_dinodes));
 	}
-	spin_lock(&rgd->rd_rsspin);
 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
 		dump_rs(seq, trs);
@@ -2439,6 +2465,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 
 	BUG_ON(ip->i_res.rs_reserved < *nblocks);
 
+	rgrp_lock_local(rbm.rgd);
 	gfs2_set_alloc_start(&rbm, ip, dinode);
 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
 
@@ -2488,6 +2515,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 
 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
+	rgrp_unlock_local(rbm.rgd);
 
 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
 	if (dinode)
@@ -2501,6 +2529,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	return 0;
 
 rgrp_error:
+	rgrp_unlock_local(rbm.rgd);
 	gfs2_rgrp_error(rbm.rgd);
 	return -EIO;
 }
@@ -2520,12 +2549,14 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 
+	rgrp_lock_local(rgd);
 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
 	rgd->rd_free += blen;
 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+	rgrp_unlock_local(rgd);
 
 	/* Directories keep their data in the metadata address space */
 	if (meta || ip->i_depth)
@@ -2561,17 +2592,20 @@ void gfs2_unlink_di(struct inode *inode)
 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
 	if (!rgd)
 		return;
+	rgrp_lock_local(rgd);
 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
+	rgrp_unlock_local(rgd);
 }
 
 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 
+	rgrp_lock_local(rgd);
 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
 	if (!rgd->rd_dinodes)
 		gfs2_consist_rgrpd(rgd);
@@ -2580,6 +2614,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 
 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+	rgrp_unlock_local(rgd);
 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
 
 	gfs2_statfs_change(sdp, 0, +1, -1);
@@ -2594,6 +2629,10 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
  * @no_addr: The block number to check
  * @type: The block type we are looking for
  *
+ * The inode glock of @no_addr must be held.  The @type to check for is either
+ * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE
+ * or GFS2_BLKST_USED would make no sense.
+ *
  * Returns: 0 if the block type matches the expected type
  *          -ESTALE if it doesn't match
  *          or -ve errno if something went wrong while checking
@@ -2619,6 +2658,12 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
 	if (WARN_ON_ONCE(error))
 		goto fail;
 
+	/*
+	 * No need to take the local resource group lock here; the inode glock
+	 * of @no_addr provides the necessary synchronization in case the block
+	 * is an inode.  (In case the block is not an inode, the block type
+	 * will not match the @type we are looking for.)
+	 */
 	if (gfs2_testbit(&rbm, false) != type)
 		error = -ESTALE;
 
@@ -2741,3 +2786,14 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
 	}
 }
 
+void rgrp_lock_local(struct gfs2_rgrpd *rgd)
+{
+	BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) &&
+	       !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags));
+	mutex_lock(&rgd->rd_mutex);
+}
+
+void rgrp_unlock_local(struct gfs2_rgrpd *rgd)
+{
+	mutex_unlock(&rgd->rd_mutex);
+}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b596c3d179888..33e52dab76efa 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -92,4 +92,8 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
 }
 
 extern void check_and_update_goal(struct gfs2_inode *ip);
+
+extern void rgrp_lock_local(struct gfs2_rgrpd *rgd);
+extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd);
+
 #endif /* __RGRP_DOT_H__ */
-- 
2.19.1.546.g028f9c799.dirty

next prev parent reply	other threads:[~2018-12-01 11:10 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-01 11:10 [Cluster-devel] [PATCH v2 0/6] gfs2: Prepare for resource group glock sharing Andreas Gruenbacher
2018-12-01 11:10 ` [Cluster-devel] [PATCH v2 1/6] gfs2: Only use struct gfs2_rbm for bitmap manipulations Andreas Gruenbacher
2018-12-01 11:10 ` [Cluster-devel] [PATCH v2 2/6] gfs2: Clean up gfs2_adjust_reservation Andreas Gruenbacher
2018-12-03 14:32   ` Bob Peterson
2018-12-03 14:58     ` Andreas Gruenbacher
2018-12-04 22:02   ` Andreas Gruenbacher
2018-12-01 11:10 ` [Cluster-devel] [PATCH v2 3/6] gfs2: Add per-reservation reserved block accounting Andreas Gruenbacher
2018-12-01 11:10 ` Andreas Gruenbacher [this message]
2018-12-01 11:10 ` [Cluster-devel] [PATCH v2 5/6] gfs2: Allow node-wide exclusive glock sharing Andreas Gruenbacher
2018-12-01 11:10 ` [Cluster-devel] [PATCH v2 6/6] gfs2: Introduce resource group sharing Andreas Gruenbacher

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:ca25043fc26d dfblob:feba57a2a6ba dfblob:4c7069b8f3c1
dfblob:a9e858e01c97 dfblob:1f427459a584 dfblob:95ecd81e61e7
dfblob:b596c3d17988 dfblob:33e52dab76ef )
 OR (
bs:"[Cluster-devel] [PATCH v2 4/6] gfs2: Add local resource group locking" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181201111019.14363-5-agruenba@redhat.com \
    --to=agruenba@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).