cluster-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
From: Bob Peterson <rpeterso@redhat.com>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] [PATCH 10/11] GFS2: Rework transition from unlinked to deleted dinodes
Date: Thu, 10 Sep 2015 14:49:50 -0500	[thread overview]
Message-ID: <1441914591-11949-11-git-send-email-rpeterso@redhat.com> (raw)
In-Reply-To: <1441914591-11949-1-git-send-email-rpeterso@redhat.com>

This patch reworks the glock's delete_work_func and function
gfs2_delete_inode so that dinodes make a proper transition from the
unlinked state to the free state. The biggest problem is that there
was a huge timing window where a node gets a callback from DLM to
delete an unlinked dinode, but it cannot do so because the inode in
memory has already been freed or is in the I_FREEING state, and
therefore not found. A previous patch allowed GFS2 to create a new
inode in those cases, but it can only do so after it waits for the
duplicate to actually be freed. This waiting caused major problems
with deadlocks due to lock ordering problems involving the inode and
its two glocks: i_gl and i_iopen_gl. This patch reworks that whole
area of code to avoid those deadlocks and still give us the proper
transition to free up unlinked dinodes.
---
 fs/gfs2/dir.c        |   2 +-
 fs/gfs2/glock.c      |   9 ++--
 fs/gfs2/incore.h     |   1 +
 fs/gfs2/inode.c      | 141 +++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/gfs2/inode.h      |   5 +-
 fs/gfs2/main.c       |   2 +
 fs/gfs2/meta_io.c    |   6 +++
 fs/gfs2/ops_fstype.c |   2 +-
 fs/gfs2/super.c      |  52 ++++++++++++++-----
 fs/gfs2/xattr.c      |   6 +++
 10 files changed, 201 insertions(+), 25 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b..f4caf2f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1563,7 +1563,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
 		brelse(bh);
 		if (fail_on_exist)
 			return ERR_PTR(-EEXIST);
-		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino);
 	}
 	return ERR_PTR(-ENOENT);
 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e5dfbcd..59f089c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -569,16 +569,17 @@ static void delete_work_func(struct work_struct *work)
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	struct gfs2_inode *ip;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	u64 no_addr = gl->gl_name.ln_number;
 
 	ip = gl->gl_object;
 	/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
-
 	if (ip)
 		inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
-	else
-		inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
+
+	if (inode == NULL || IS_ERR(inode))
+		inode = gfs2_inode_lookup_for_del(sdp->sd_vfs, no_addr);
+
 	if (inode && !IS_ERR(inode)) {
 		d_prune_aliases(inode);
 		iput(inode);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 5065e0c..3cf0224 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -381,6 +381,7 @@ enum {
 	GIF_SW_PAGED		= 3,
 	GIF_ORDERED		= 4,
 	GIF_FREE_VFS_INODE      = 5,
+	GIF_AM_DELETING         = 6,
 };
 
 struct gfs2_inode {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 06c208b..631d4ef 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -78,15 +78,15 @@ static void gfs2_set_iop(struct inode *inode)
 /**
  * gfs2_inode_lookup - Lookup an inode
  * @sb: The super block
- * @no_addr: The inode number
  * @type: The type of the inode
- * non_block: Can we block on inodes that are being freed?
+ * @no_addr: The inode number
+ * @no_formal_ino: The formal inode number (for NFS)
  *
  * Returns: A VFS inode, or an error
  */
 
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
-				u64 no_addr, u64 no_formal_ino, int non_block)
+				u64 no_addr, u64 no_formal_ino)
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
@@ -153,6 +153,139 @@ fail:
 	return ERR_PTR(error);
 }
 
+/**
+ * gfs2_inode_lookup_for_del - Lookup an unlinked inode so we can delete it
+ * @sb: The super block
+ * @no_addr: The inode number
+ *
+ * Returns: A VFS inode, or an error
+ *
+ * We jump through some hoops here to avoid a special case in which the block
+ * has been freed and already reallocated for a different inode while after
+ * the iopen callback was received to signify a remote unlink that needs
+ * deleting. In that case, we don't want to return the inode to the caller
+ * to delete the inode. We also need to do an inode_refresh to ensure that
+ * whomever recreated the dinode gets a proper i_nlink count, otherwise the
+ * vfs might think it's unlinked and still needs deleting.
+ */
+struct inode *gfs2_inode_lookup_for_del(struct super_block *sb, u64 no_addr)
+{
+	struct inode *inode;
+	struct gfs2_inode *ip;
+	struct gfs2_glock *io_gl = NULL;
+	struct gfs2_holder i_gh;
+	int error, btype;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct address_space *mapping;
+	BUG_ON(no_addr == 0);
+	inode = iget_locked(sb, (unsigned long)no_addr);
+	ip = GFS2_I(inode);
+
+	if (!inode)
+		return ERR_PTR(-ENOBUFS);
+
+	if (!(inode->i_state & I_NEW)) {
+		BUG_ON(ip->i_no_addr != no_addr);
+		return inode;
+	}
+
+	ip->i_no_addr = no_addr;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE,
+			       &ip->i_gl);
+	if (unlikely(error))
+		goto fail;
+	/* If this is a brand new inode, but a recycled glock, we've got a
+	   reference problem. In clear_inode it does an extra glock_put so the
+	   next time we do clear_inode for this inode, we'll get in trouble.
+	   So we hold the glock to bump the reference count. */
+	if (ip->i_gl->gl_flags != 0) /* New inode, recycled glock */
+		ip->i_gl->gl_lockref.count++;
+
+	ip->i_gl->gl_object = ip;
+
+	error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE,
+			       &io_gl);
+	if (unlikely(error))
+		goto fail_put;
+
+	ip->i_iopen_gh.gh_gl = NULL;
+	gfs2_glock_put(io_gl);
+	io_gl = NULL;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &i_gh);
+	if (error)
+		goto fail_put;
+
+	/* Inode glock must be locked already */
+	btype = gfs2_get_blk_type(sdp, no_addr);
+	if (btype < 0) {
+		error = btype;
+		goto fail_refresh;
+	}
+	if (btype == GFS2_BLKST_FREE || btype == GFS2_BLKST_USED) {
+		error = -ESTALE;
+		goto fail_refresh;
+	}
+
+	/* At this point it's either UNLINKED or DINODE */
+
+	/* If there aren't any pages associated with it and it's a new inode,
+	 * we shouldn't be messing with it, even to read in pages. We should
+	 * just exit and let whomever's using it carry on. If we do inode
+	 * refresh, we'll end up adding pages to the cache that we'd otherwise
+	 * need to truncate with truncate_inode_page().
+	 */
+	mapping = gfs2_glock2aspace(ip->i_gl);
+	if (mapping->nrpages == 0 && btype == GFS2_BLKST_DINODE) {
+		error = -ESTALE;
+		goto fail_refresh;
+	}
+	/* At this point, we've got a dinode with pages associated with it
+	 * or it's unlinked. If it's unlinked, we need to do inode_refresh
+	 * so that our put() will delete it normally through gfs2_delete_inode.
+	 * If it has pages associated with it, we may have grabbed the glock
+	 * while it was being created, so we need to refresh it before
+	 * exiting.
+	 */
+	error = gfs2_inode_refresh(GFS2_I(inode));
+	if (error)
+		goto fail_refresh;
+
+	if (btype == GFS2_BLKST_DINODE) {
+		/* Now we know this is a dinode (not unlinked), but we know
+		 * there were already pages associated with it. So it's safe
+		 * to exit:
+		 */
+		error = -ESTALE;
+		goto fail_refresh;
+	}
+
+	gfs2_set_iop(inode);
+	unlock_new_inode(inode);
+	gfs2_glock_dq_uninit(&i_gh);
+	gfs2_glock_put(ip->i_gl);
+	return inode;
+
+fail_refresh:
+	ip->i_gl->gl_object = NULL; /* See note below */
+	gfs2_glock_dq_uninit(&i_gh);
+fail_put:
+	/* This setting of gl_object to NULL is done by the other lookup
+	 * functions. But if it races with someone reusing the dinode, we
+	 * don't want to mess them up. It seems necessary in order to prevent
+	 * buffer_heads from being attached after the i_gh is acquired.
+	 * But it seems like it has the potential to screw up people trying
+	 * to re-use the glock for a new incarnation of the inode.
+	 * For now, I'm going to move it inside the dq_uninit.
+	 */
+	/*ip->i_gl->gl_object = NULL;*/
+	gfs2_glock_put(ip->i_gl);
+fail:
+	iget_failed(inode);
+	return ERR_PTR(error);
+}
+
 struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 				  u64 *no_formal_ino, unsigned int blktype)
 {
@@ -171,7 +304,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 	if (error)
 		goto fail;
 
-	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1);
+	inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
 	if (IS_ERR(inode))
 		goto fail;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 22c27a8..9e838b3 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -94,8 +94,9 @@ err:
 }
 
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-				       u64 no_addr, u64 no_formal_ino,
-				       int non_block);
+				       u64 no_addr, u64 no_formal_ino);
+extern struct inode *gfs2_inode_lookup_for_del(struct super_block *sb,
+					       u64 no_addr);
 extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
 					 u64 *no_formal_ino,
 					 unsigned int blktype);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 241a399..a8bd5b6 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -43,6 +43,8 @@ static void gfs2_init_inode_once(void *foo)
 	INIT_LIST_HEAD(&ip->i_trunc_list);
 	ip->i_res = NULL;
 	ip->i_hash_cache = NULL;
+	ip->i_iopen_gh.gh_gl = NULL;
+	INIT_LIST_HEAD(&ip->i_iopen_gh.gh_list);
 }
 
 static void gfs2_init_glock_once(void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0e1d4be..f5b4948 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -128,6 +128,12 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	index = blkno >> shift;             /* convert block to page */
 	bufnum = blkno - (index << shift);  /* block buf index within page */
 
+	if ((gl->gl_name.ln_type == LM_TYPE_INODE) && gl->gl_object) {
+		struct gfs2_inode *ip = gl->gl_object;
+		struct inode *inode = &ip->i_inode;
+		if (!test_bit(GIF_AM_DELETING, &ip->i_flags))
+			BUG_ON(inode->i_state & I_FREEING);
+	}
 	if (create) {
 		for (;;) {
 			page = grab_cache_page(mapping, index);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1e3a93f..f6db0da 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -451,7 +451,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
 	struct dentry *dentry;
 	struct inode *inode;
 
-	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
 	if (IS_ERR(inode)) {
 		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
 		return PTR_ERR(inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2f29ca5..7b59a44 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1511,6 +1511,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
+	struct gfs2_glock *io_gl = NULL, *i_gl = NULL;
 	struct address_space *metamapping;
 	int error;
 
@@ -1522,8 +1523,13 @@ static void gfs2_evict_inode(struct inode *inode)
 	if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
 		goto out;
 
+	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops,
+			       CREATE, &i_gl);
+	if (unlikely(error))
+		goto out;
+
 	/* Must not read inode block until block type has been verified */
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
+	error = gfs2_glock_nq_init(i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 	if (unlikely(error)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq_wait(&ip->i_iopen_gh);
@@ -1531,10 +1537,13 @@ static void gfs2_evict_inode(struct inode *inode)
 		goto out;
 	}
 
+	set_bit(GIF_AM_DELETING, &ip->i_flags);
 	if (!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
 		error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
-		if (error)
-			goto out_truncate;
+		if (error) {
+			gfs2_glock_dq_uninit(&gh);
+			goto out;
+		}
 	}
 
 	if (test_bit(GIF_INVALID, &ip->i_flags)) {
@@ -1543,10 +1552,22 @@ static void gfs2_evict_inode(struct inode *inode)
 			goto out_truncate;
 	}
 
+	/* The iopen glock may have been released prior to this, but we need
+	   it back. */
+	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE,
+			       &io_gl);
+	if (error)
+		goto out_truncate;
+
 	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-	gfs2_glock_dq_wait(&ip->i_iopen_gh);
-	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
-	error = gfs2_glock_nq(&ip->i_iopen_gh);
+	if (ip->i_iopen_gh.gh_gl &&
+	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+		gfs2_glock_dq_wait(&ip->i_iopen_gh);
+		gfs2_holder_uninit(&ip->i_iopen_gh);
+	}
+
+	error = gfs2_glock_nq_init(io_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB |
+				   GL_NOCACHE, &ip->i_iopen_gh);
 	if (error)
 		goto out_truncate;
 
@@ -1575,14 +1596,14 @@ static void gfs2_evict_inode(struct inode *inode)
 	goto out_unlock;
 
 out_truncate:
-	gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
-	metamapping = gfs2_glock2aspace(ip->i_gl);
-	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
+	gfs2_log_flush(sdp, i_gl, NORMAL_FLUSH);
+	metamapping = gfs2_glock2aspace(i_gl);
+	if (test_bit(GLF_DIRTY, &i_gl->gl_flags)) {
 		filemap_fdatawrite(metamapping);
 		filemap_fdatawait(metamapping);
 	}
 	write_inode_now(inode, 1);
-	gfs2_ail_flush(ip->i_gl, 0);
+	gfs2_ail_flush(i_gl, 0);
 
 	/* Case 2 starts here */
 	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
@@ -1598,11 +1619,15 @@ out_unlock:
 	if (gfs2_rs_active(ip->i_res))
 		gfs2_rs_deltree(ip->i_res);
 
-	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+	if (ip->i_iopen_gh.gh_gl &&
+	    test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 		gfs2_glock_dq_wait(&ip->i_iopen_gh);
+		gfs2_holder_uninit(&ip->i_iopen_gh);
 	}
-	gfs2_holder_uninit(&ip->i_iopen_gh);
+	if (io_gl)
+		gfs2_glock_put(io_gl);
+
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
 		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
@@ -1616,7 +1641,8 @@ out:
 	ip->i_gl->gl_object = NULL;
 	flush_delayed_work(&ip->i_gl->gl_work);
 	gfs2_glock_add_to_lru(ip->i_gl);
-	gfs2_glock_put(ip->i_gl);
+	if (i_gl)
+		gfs2_glock_put(i_gl);
 	ip->i_gl = NULL;
 	if (ip->i_iopen_gh.gh_gl) {
 		ip->i_iopen_gh.gh_gl->gl_object = NULL;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa..e436ad0 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -625,6 +625,8 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	u64 block;
 	int error;
 
+	BUG_ON((!gfs2_glock_is_locked_by_me(ip->i_gl)) ||
+	       (!gfs2_glock_is_held_excl(ip->i_gl)));
 	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 	if (error)
 		return error;
@@ -687,6 +689,8 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			int mh_size = sizeof(struct gfs2_meta_header);
 			unsigned int n = 1;
 
+			BUG_ON((!gfs2_glock_is_locked_by_me(ip->i_gl)) ||
+			       (!gfs2_glock_is_held_excl(ip->i_gl)));
 			error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 			if (error)
 				return error;
@@ -1003,6 +1007,8 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	} else {
 		u64 blk;
 		unsigned int n = 1;
+		BUG_ON((!gfs2_glock_is_locked_by_me(ip->i_gl)) ||
+		       (!gfs2_glock_is_held_excl(ip->i_gl)));
 		error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
 		if (error)
 			return error;
-- 
2.4.3



  parent reply	other threads:[~2015-09-10 19:49 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-10 19:49 [Cluster-devel] [PATCH 00/11] Eleven patches related to file unlink->delete->new Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 01/11] GFS2: Update master statfs buffer with sd_statfs_spin locked Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 02/11] GFS2: Allow fail_gunlock3 to set the free_vfs_inode bit Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 03/11] GFS2: Protect log tail calculations with inside locks Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 04/11] GFS2: Wait for iopen glock dequeues Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 05/11] GFS2: Reintroduce a timeout in function gfs2_gl_hash_clear Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 06/11] GFS2: Prevent gl_delete work for re-used inodes Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 07/11] GFS2: Truncate address space mapping when deleting an inode Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 08/11] GFS2: Don't filter out I_FREEING inodes anymore Bob Peterson
2015-09-10 19:49 ` [Cluster-devel] [PATCH 09/11] GFS2: generalize gfs2_check_blk_type Bob Peterson
2015-09-10 19:49 ` Bob Peterson [this message]
2015-09-10 19:49 ` [Cluster-devel] [PATCH 11/11] GFS2: Change from tr_touched to tr_bufs Bob Peterson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1441914591-11949-11-git-send-email-rpeterso@redhat.com \
    --to=rpeterso@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).