From mboxrd@z Thu Jan  1 00:00:00 1970
From: Steven Whitehouse
Date: Tue, 2 Feb 2016 11:53:49 +0000
Subject: [Cluster-devel] [GFS2 PATCH] GFS2: Queue final dlm unlock from gfs2_evict_inode
In-Reply-To: <853571928.36038478.1450464138537.JavaMail.zimbra@redhat.com>
References: <853571928.36038478.1450464138537.JavaMail.zimbra@redhat.com>
Message-ID: <56B098CD.9010201@redhat.com>
List-Id:
To: cluster-devel.redhat.com
MIME-Version: 1.0
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

Hi,

On 18/12/15 18:42, Bob Peterson wrote:
> Hi,
>
> This patch introduces a new glock workqueue, gfs2_glock_final. The
> workqueue merely does work to call dlm's unlock.
> This prevents gfs2_evict_inode from calling dlm directly which
> might block, waiting for DLM to unlock, which may be waiting for
> something like a fence operation. By moving it to its own work queue,
> the final put happens later, which allows the shrinker to continue,
> and free memory, avoiding a livelock.

I don't think this is going to work... if the DLM lock put is delayed,
then it would be possible for a new glock to be created (or the existing
one to be reused) if the inode gets recreated, and then there is no
coordination between the unlock and the new lock request. If those were
to be sent in the wrong order, things are going to get very confused (a
sketch of this reordering follows at the end of this message). If you
add the coordination, then we are back to square one in terms of the
deadlock in this loop.

The only way we can stop this is to try and avoid inode deallocation in
the eviction path. By the time we've already decided to do the eviction,
it is too late,

Steve.

> Signed-off-by: Bob Peterson
> ---
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 795c2f3..9d0f3d5 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
>  
>  static struct dentry *gfs2_root;
>  static struct workqueue_struct *glock_workqueue;
> +static struct workqueue_struct *gfs2_final_workqueue;
>  struct workqueue_struct *gfs2_delete_workqueue;
>  static LIST_HEAD(lru_list);
>  static atomic_t lru_count = ATOMIC_INIT(0);
> @@ -152,6 +153,20 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
>          spin_unlock(&lru_lock);
>  }
>  
> +/* The purpose of this function is to tell dlm when a glock is not needed.
> + * We can't do this directly from gfs2_glock_put because dlm may block while
> + * waiting for a fence operation to complete. But the fence may block on
> + * memory allocation, which may block on the shrinker, which may block on
> + * the evict code. So the buck stops here.
> + */
> +static void final_work_func(struct work_struct *work)
> +{
> +        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_final);
> +        struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> +
> +        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
> +}
> +
>  /**
>   * gfs2_glock_put() - Decrement reference count on glock
>   * @gl: The glock to put
> @@ -160,7 +175,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
>  
>  void gfs2_glock_put(struct gfs2_glock *gl)
>  {
> -        struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
>          struct address_space *mapping = gfs2_glock2aspace(gl);
>  
>          if (lockref_put_or_lock(&gl->gl_lockref))
> @@ -174,7 +188,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
>          GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
>          GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
>          trace_gfs2_glock_put(gl);
> -        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
> +        BUG_ON(queue_work(gfs2_final_workqueue, &gl->gl_final) == 0);
>  }
>  
>  /**
> @@ -700,6 +714,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
>          gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
>          INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
>          INIT_WORK(&gl->gl_delete, delete_work_func);
> +        INIT_WORK(&gl->gl_final, final_work_func);
>  
>          mapping = gfs2_glock2aspace(gl);
>          if (mapping) {
> @@ -1772,6 +1787,14 @@ int __init gfs2_glock_init(void)
>                  rhashtable_destroy(&gl_hash_table);
>                  return -ENOMEM;
>          }
> +        gfs2_final_workqueue = alloc_workqueue("final_workqueue",
> +                                               WQ_MEM_RECLAIM | WQ_HIGHPRI |
> +                                               WQ_FREEZABLE, 0);
> +        if (IS_ERR(gfs2_final_workqueue)) {
> +                destroy_workqueue(glock_workqueue);
> +                destroy_workqueue(gfs2_delete_workqueue);
> +                return PTR_ERR(gfs2_final_workqueue);
> +        }
>  
>          register_shrinker(&glock_shrinker);
>  
> @@ -1784,6 +1807,7 @@ void gfs2_glock_exit(void)
>          rhashtable_destroy(&gl_hash_table);
>          destroy_workqueue(glock_workqueue);
>          destroy_workqueue(gfs2_delete_workqueue);
> +        destroy_workqueue(gfs2_final_workqueue);
>  }
>  
>  static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a6a3389..1b63fbc 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -361,6 +361,7 @@ struct gfs2_glock {
>          atomic_t gl_ail_count;
>          atomic_t gl_revokes;
>          struct delayed_work gl_work;
> +        struct work_struct gl_final;
>          union {
>                  /* For inode and iopen glocks only */
>                  struct work_struct gl_delete;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 0357862..4232368 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -126,6 +126,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
>          error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
>          if (unlikely(error))
>                  goto fail;
> +
> +        flush_work(&ip->i_gl->gl_final);
>          ip->i_gl->gl_object = ip;
>  
>          error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
> @@ -189,6 +191,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
>          if (error)
>                  return ERR_PTR(error);
>  
> +        flush_work(&i_gh.gh_gl->gl_final);
>          error = gfs2_check_blk_type(sdp, no_addr, blktype);
>          if (error)
>                  goto fail;
> @@ -681,6 +684,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
>          if (error)
>                  goto fail_free_inode;
>  
> +        flush_work(&ip->i_gl->gl_final);
>          ip->i_gl->gl_object = ip;
>          error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
>          if (error)
>
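To make the reordering concern above concrete, here is a minimal
userspace sketch. All names here (final_unlock_work,
send_to_lock_manager, and so on) are hypothetical stand-ins, not GFS2
or DLM code; it just shows that once the final unlock is handed off to
a work item, nothing orders it against a fresh lock request for the
same lock name:

/*
 * Toy illustration only (hypothetical names, not GFS2/DLM code): if the
 * final unlock for a lock name is deferred to a work queue, a new lock
 * request for the same name can reach the lock manager first.
 *
 * Build: gcc -pthread reorder.c -o reorder
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for sending a request to the lock manager. */
static void send_to_lock_manager(const char *req, const char *name)
{
        printf("lock manager sees: %s %s\n", req, name);
}

/* The deferred "final unlock" work item queued by the evict path. */
static void *final_unlock_work(void *name)
{
        sleep(1);       /* work queue backed up, e.g. behind a fence */
        send_to_lock_manager("UNLOCK", name);
        return NULL;
}

int main(void)
{
        pthread_t worker;
        const char *name = "inode 0x1234";

        /* Evict path: queue the final unlock rather than sending it now. */
        pthread_create(&worker, NULL, final_unlock_work, (void *)name);

        /*
         * The inode is then recreated, and the new glock for the same
         * lock name sends its request immediately. Nothing orders this
         * against the still-pending unlock.
         */
        send_to_lock_manager("LOCK", name);

        pthread_join(worker, NULL);
        return 0;
}

Here the LOCK arrives before the UNLOCK for the same name. The
flush_work() calls the patch adds to the lookup and create paths are
the coordination needed to close that window, but waiting on a work
item that may itself be stuck behind a fence is exactly the loop we
started with.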