* [Cluster-devel] [GFS2 PATCH] GFS2: Queue final dlm unlock from gfs2_evict_inode
From: Bob Peterson @ 2015-12-18 18:42 UTC
To: cluster-devel.redhat.com
Hi,
This patch introduces a new glock workqueue, gfs2_final_workqueue, whose
only job is to call dlm's unlock.
This prevents gfs2_evict_inode from calling dlm directly, which might
block waiting for DLM to unlock, which in turn may be waiting for
something like a fence operation. By moving the unlock to its own work
queue, the final put happens later, which allows the shrinker to continue
and free memory, avoiding a livelock.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
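(For quick review, the heart of the change, condensed from the diff below:
the final call into dlm moves out of gfs2_glock_put() and into a work item
on the new workqueue.

static void final_work_func(struct work_struct *work)
{
	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_final);
	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;

	sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); /* may block behind fencing */
}

	/* and gfs2_glock_put() queues the work instead of calling dlm inline: */
	BUG_ON(queue_work(gfs2_final_workqueue, &gl->gl_final) == 0);

The evicting task only queues the work and returns; the potentially
blocking call into dlm runs later in workqueue context.)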
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 795c2f3..9d0f3d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
static struct dentry *gfs2_root;
static struct workqueue_struct *glock_workqueue;
+static struct workqueue_struct *gfs2_final_workqueue;
struct workqueue_struct *gfs2_delete_workqueue;
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
@@ -152,6 +153,20 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
spin_unlock(&lru_lock);
}
+/* The purpose of this function is to tell dlm when a glock is not needed
+ * We can't do this directly from gfs2_glock_put because dlm may block while
+ * waiting for a fence operation to complete. But the fence may block on
+ * memory allocation, which may block on the shrinker, which may block on
+ * the evict code. So the buck stops here.
+ */
+static void final_work_func(struct work_struct *work)
+{
+ struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_final);
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+
+ sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+}
+
/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
@@ -160,7 +175,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
if (lockref_put_or_lock(&gl->gl_lockref))
@@ -174,7 +188,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
- sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+ BUG_ON(queue_work(gfs2_final_workqueue, &gl->gl_final) == 0);
}
/**
@@ -700,6 +714,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
INIT_WORK(&gl->gl_delete, delete_work_func);
+ INIT_WORK(&gl->gl_final, final_work_func);
mapping = gfs2_glock2aspace(gl);
if (mapping) {
@@ -1772,6 +1787,14 @@ int __init gfs2_glock_init(void)
rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
+ gfs2_final_workqueue = alloc_workqueue("final_workqueue",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI |
+ WQ_FREEZABLE, 0);
+ if (IS_ERR(gfs2_final_workqueue)) {
+ destroy_workqueue(glock_workqueue);
+ destroy_workqueue(gfs2_delete_workqueue);
+ return PTR_ERR(gfs2_final_workqueue);
+ }
register_shrinker(&glock_shrinker);
@@ -1784,6 +1807,7 @@ void gfs2_glock_exit(void)
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
destroy_workqueue(gfs2_delete_workqueue);
+ destroy_workqueue(gfs2_final_workqueue);
}
static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a6a3389..1b63fbc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -361,6 +361,7 @@ struct gfs2_glock {
atomic_t gl_ail_count;
atomic_t gl_revokes;
struct delayed_work gl_work;
+ struct work_struct gl_final;
union {
/* For inode and iopen glocks only */
struct work_struct gl_delete;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0357862..4232368 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -126,6 +126,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
if (unlikely(error))
goto fail;
+
+ flush_work(&ip->i_gl->gl_final);
ip->i_gl->gl_object = ip;
error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
@@ -189,6 +191,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
if (error)
return ERR_PTR(error);
+ flush_work(&i_gh.gh_gl->gl_final);
error = gfs2_check_blk_type(sdp, no_addr, blktype);
if (error)
goto fail;
@@ -681,6 +684,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_free_inode;
+ flush_work(&ip->i_gl->gl_final);
ip->i_gl->gl_object = ip;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
if (error)
* [Cluster-devel] [GFS2 PATCH] GFS2: Queue final dlm unlock from gfs2_evict_inode
From: Steven Whitehouse @ 2016-02-02 11:53 UTC
To: cluster-devel.redhat.com
Hi,
On 18/12/15 18:42, Bob Peterson wrote:
> Hi,
>
> This patch introduces a new glock workqueue, gfs2_final_workqueue, whose
> only job is to call dlm's unlock.
> This prevents gfs2_evict_inode from calling dlm directly, which might
> block waiting for DLM to unlock, which in turn may be waiting for
> something like a fence operation. By moving the unlock to its own work
> queue, the final put happens later, which allows the shrinker to continue
> and free memory, avoiding a livelock.
I don't think this is going to work... if the DLM lock put is delayed
then it would be possible for a new glock to be created (or the existing
one to be reused) if the inode gets recreated, and then there is no
coordination between the unlock and the new lock request. If those were
to be sent in the wrong order, then things are going to get very confused.
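In other words, an interleaving like this (purely illustrative, not taken
from a real trace) becomes possible:

  evict:      glock refcount hits zero, the dlm unlock is queued but the
              work has not run yet
  re-lookup:  the inode is recreated, a new glock is built for the same
              lock name and a new dlm lock request goes out
  workqueue:  the deferred unlock for the old glock finally runs

Nothing orders the deferred unlock against the new lock request, so dlm
may see them in either order.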
If you add the coordination then we are back to square one, in terms of
the deadlock in this loop. The only way we can stop this is to try and
avoid inode deallocation in the eviction path. By the time we've already
decided to do the eviction, it is too late,
Steve.
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
> ---
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 795c2f3..9d0f3d5 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
>
> static struct dentry *gfs2_root;
> static struct workqueue_struct *glock_workqueue;
> +static struct workqueue_struct *gfs2_final_workqueue;
> struct workqueue_struct *gfs2_delete_workqueue;
> static LIST_HEAD(lru_list);
> static atomic_t lru_count = ATOMIC_INIT(0);
> @@ -152,6 +153,20 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
> spin_unlock(&lru_lock);
> }
>
> +/* The purpose of this function is to tell dlm when a glock is not needed
> + * We can't do this directly from gfs2_glock_put because dlm may block while
> + * waiting for a fence operation to complete. But the fence may block on
> + * memory allocation, which may block on the shrinker, which may block on
> + * the evict code. So the buck stops here.
> + */
> +static void final_work_func(struct work_struct *work)
> +{
> + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_final);
> + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> +
> + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
> +}
> +
> /**
> * gfs2_glock_put() - Decrement reference count on glock
> * @gl: The glock to put
> @@ -160,7 +175,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
>
> void gfs2_glock_put(struct gfs2_glock *gl)
> {
> - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> struct address_space *mapping = gfs2_glock2aspace(gl);
>
> if (lockref_put_or_lock(&gl->gl_lockref))
> @@ -174,7 +188,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
> GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
> GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
> trace_gfs2_glock_put(gl);
> - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
> + BUG_ON(queue_work(gfs2_final_workqueue, &gl->gl_final) == 0);
> }
>
> /**
> @@ -700,6 +714,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
> gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
> INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
> INIT_WORK(&gl->gl_delete, delete_work_func);
> + INIT_WORK(&gl->gl_final, final_work_func);
>
> mapping = gfs2_glock2aspace(gl);
> if (mapping) {
> @@ -1772,6 +1787,14 @@ int __init gfs2_glock_init(void)
> rhashtable_destroy(&gl_hash_table);
> return -ENOMEM;
> }
> + gfs2_final_workqueue = alloc_workqueue("final_workqueue",
> + WQ_MEM_RECLAIM | WQ_HIGHPRI |
> + WQ_FREEZABLE, 0);
> + if (IS_ERR(gfs2_final_workqueue)) {
> + destroy_workqueue(glock_workqueue);
> + destroy_workqueue(gfs2_delete_workqueue);
> + return PTR_ERR(gfs2_final_workqueue);
> + }
>
> register_shrinker(&glock_shrinker);
>
> @@ -1784,6 +1807,7 @@ void gfs2_glock_exit(void)
> rhashtable_destroy(&gl_hash_table);
> destroy_workqueue(glock_workqueue);
> destroy_workqueue(gfs2_delete_workqueue);
> + destroy_workqueue(gfs2_final_workqueue);
> }
>
> static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a6a3389..1b63fbc 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -361,6 +361,7 @@ struct gfs2_glock {
> atomic_t gl_ail_count;
> atomic_t gl_revokes;
> struct delayed_work gl_work;
> + struct work_struct gl_final;
> union {
> /* For inode and iopen glocks only */
> struct work_struct gl_delete;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 0357862..4232368 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -126,6 +126,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
> error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
> if (unlikely(error))
> goto fail;
> +
> + flush_work(&ip->i_gl->gl_final);
> ip->i_gl->gl_object = ip;
>
> error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
> @@ -189,6 +191,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
> if (error)
> return ERR_PTR(error);
>
> + flush_work(&i_gh.gh_gl->gl_final);
> error = gfs2_check_blk_type(sdp, no_addr, blktype);
> if (error)
> goto fail;
> @@ -681,6 +684,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
> if (error)
> goto fail_free_inode;
>
> + flush_work(&ip->i_gl->gl_final);
> ip->i_gl->gl_object = ip;
> error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
> if (error)
>
* [Cluster-devel] [GFS2 PATCH] GFS2: Delay glock disposal by way of a rubbish list
From: Bob Peterson @ 2016-02-10 19:59 UTC
To: cluster-devel.redhat.com
Hi,
----- Original Message -----
> Hi,
>
> On 18/12/15 18:42, Bob Peterson wrote:
> > Hi,
> >
> > This patch introduces a new glock workqueue, gfs2_final_workqueue, whose
> > only job is to call dlm's unlock.
> > This prevents gfs2_evict_inode from calling dlm directly, which might
> > block waiting for DLM to unlock, which in turn may be waiting for
> > something like a fence operation. By moving the unlock to its own work
> > queue, the final put happens later, which allows the shrinker to continue
> > and free memory, avoiding a livelock.
> I don't think this is going to work... if the DLM lock put is delayed
> then it would be possible for a new glock to be created (or the existing
> one to be reused) if the inode gets recreated, and then there is no
> coordination between the unlock and the new lock request. If those were
> to be sent in the wrong order, then things are going to get very confused.
>
> If you add the coordination then we are back to square one, in terms of
> the deadlock in this loop. The only way we can stop this is to try and
> avoid inode deallocation in the eviction path. By the time we've already
> decided to do the eviction, it is too late,
>
> Steve.
Okay, I scrapped that idea and decided to do something very similar to
what GFS1 did. That is, to use a daemon to eliminate glocks that are no
longer needed. Rather than introduce a new daemon, I'm adding new functionality
to the existing quota daemon. This version seems to be running just as
well with the scenario I've been struggling with for so long.
What do you think? If we can't do this, I'm out of ideas on how to solve it.
Regards,
Bob Peterson
Red Hat File Systems
---
Patch description:
This patch adds a new rubbish list for glocks. When a glock's reference
count goes to zero, it gets thrown on the rubbish list, rather than
being freed and unlocked in dlm immediately. If we tell dlm to unlock
immediately, we run the risk of a livelock in which GFS2 waits forever
for DLM to release the lock, DLM waits forever for a pending fence
action, but the fence action waits forever for memory, which itself
blocks on the inode shrinker, which is trying to free inodes.
With this new method, the glock gets tossed onto the rubbish list,
then the inode free may continue. Thus, the shrinker may continue
running, avoiding the livelock. The glocks on the rubbish list are
processed periodically by the quota daemon, which is very similar to
what GFS1 used to do (but GFS1 had several daemons; one was dedicated
to this process).
When the daemon processes the rubbish list, it first checks to see
if the glock has been recreated in the hash tables. If so, it can't
instruct DLM to unlock it; the new lock supersedes the dead one.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
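(Condensed from the diff below, the new path is: gfs2_glock_put() just
moves the dead glock onto the rubbish list, and gfs2_rubbish_removal(),
called from the quota daemon, decides later whether dlm still needs to
be told:

	/* gfs2_glock_put(): defer the dlm unlock instead of doing it inline */
	spin_lock(&rubbish_lock);
	list_add_tail(&gl->gl_lru, &rubbish_list);
	spin_unlock(&rubbish_lock);

	/* gfs2_rubbish_removal(), for each glock pulled off the list: */
	if (rhashtable_lookup_fast(&gl_hash_table, &gl->gl_name, ht_parms))
		gfs2_glock_free(gl);    /* recreated; the new lock supersedes it */
	else
		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); /* tell dlm to unlock */

so the eviction path never has to wait on dlm directly.)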
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8dedece..703a4fd 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -67,6 +67,8 @@ struct workqueue_struct *gfs2_delete_workqueue;
static LIST_HEAD(lru_list);
static atomic_t lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(lru_lock);
+static LIST_HEAD(rubbish_list);
+static DEFINE_SPINLOCK(rubbish_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -160,7 +162,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
if (lockref_put_or_lock(&gl->gl_lockref))
@@ -173,8 +174,44 @@ void gfs2_glock_put(struct gfs2_glock *gl)
rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
- trace_gfs2_glock_put(gl);
- sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+ spin_lock(&rubbish_lock);
+ list_add_tail(&gl->gl_lru, &rubbish_list);
+ spin_unlock(&rubbish_lock);
+}
+
+/**
+ * gfs2_rubbish_removal - destroy glocks that are no longer wanted
+ */
+long gfs2_rubbish_removal(long count)
+{
+ struct gfs2_glock *gl;
+ struct gfs2_sbd *sdp;
+ long removed = 0;
+
+ spin_lock(&rubbish_lock);
+ while(count-- && !list_empty(&rubbish_list)) {
+ gl = list_entry(rubbish_list.next, struct gfs2_glock, gl_lru);
+ BUG_ON(!__lockref_is_dead(&gl->gl_lockref));
+ sdp = gl->gl_name.ln_sbd;
+ list_del_init(&gl->gl_lru);
+ spin_unlock(&rubbish_lock);
+
+ /* If it was recreated after being sacked, don't tell dlm to
+ unlock. The new one supersedes the one in the rubbish. */
+ if (rhashtable_lookup_fast(&gl_hash_table, &gl->gl_name,
+ ht_parms)) {
+ gfs2_glock_free(gl);
+ } else { /* was not recreated */
+ trace_gfs2_glock_put(gl);
+ sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+ }
+ removed++;
+ if (count && !list_empty(&rubbish_list))
+ cond_resched();
+ spin_lock(&rubbish_lock);
+ }
+ spin_unlock(&rubbish_lock);
+ return removed;
}
/**
@@ -1503,13 +1540,21 @@ static void dump_glock_func(struct gfs2_glock *gl)
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
+ long removed;
+ int i;
+
set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
- wait_event_timeout(sdp->sd_glock_wait,
- atomic_read(&sdp->sd_glock_disposal) == 0,
- HZ * 600);
+ for (i = 0; i < 600; i++) {
+ removed = gfs2_rubbish_removal(LONG_MAX);
+ wait_event_timeout(sdp->sd_glock_wait,
+ atomic_read(&sdp->sd_glock_disposal) == 0,
+ HZ);
+ if (atomic_read(&sdp->sd_glock_disposal) == 0)
+ break;
+ }
glock_hash_walk(dump_glock_func, sdp);
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 46ab67f..5fe645f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -203,6 +203,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
extern __printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
+extern long gfs2_rubbish_removal(long count);
/**
* gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index be6d9c4..c343c2e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1534,6 +1534,7 @@ int gfs2_quotad(void *data)
unsigned long t = 0;
DEFINE_WAIT(wait);
int empty;
+ long removed;
while (!kthread_should_stop()) {
@@ -1555,6 +1556,9 @@ int gfs2_quotad(void *data)
/* Check for & recover partially truncated inodes */
quotad_check_trunc_list(sdp);
+ /* Destroy any glocks that need destroying */
+ removed = gfs2_rubbish_removal(2500);
+
try_to_freeze();
t = min(quotad_timeo, statfs_timeo);
@@ -1563,7 +1567,7 @@ int gfs2_quotad(void *data)
spin_lock(&sdp->sd_trunc_lock);
empty = list_empty(&sdp->sd_trunc_list);
spin_unlock(&sdp->sd_trunc_lock);
- if (empty && !sdp->sd_statfs_force_sync)
+ if (empty && removed == 0 && !sdp->sd_statfs_force_sync)
t -= schedule_timeout(t);
else
t = 0;
* [Cluster-devel] [GFS2 PATCH] GFS2: Delay glock disposal by way of a rubbish list
From: Steven Whitehouse @ 2016-02-11 15:34 UTC
To: cluster-devel.redhat.com
Hi,
On 10/02/16 19:59, Bob Peterson wrote:
> Hi,
>
> ----- Original Message -----
>> Hi,
>>
>> On 18/12/15 18:42, Bob Peterson wrote:
>>> Hi,
>>>
>>> This patch introduces a new glock workqueue, gfs2_final_workqueue, whose
>>> only job is to call dlm's unlock.
>>> This prevents gfs2_evict_inode from calling dlm directly, which might
>>> block waiting for DLM to unlock, which in turn may be waiting for
>>> something like a fence operation. By moving the unlock to its own work
>>> queue, the final put happens later, which allows the shrinker to continue
>>> and free memory, avoiding a livelock.
>> I don't think this is going to work... if the DLM lock put is delayed
>> then it would be possible for a new glock to be created (or the existing
>> one to be reused) if the inode gets recreated, and then there is no
>> coordination between the unlock and the new lock request. If those were
>> to be sent in the wrong order, then things are going to get very confused.
>>
>> If you add the coordination then we are back to square one, in terms of
>> the deadlock in this loop. The only way we can stop this is to try and
>> avoid inode deallocation in the eviction path. By the time we've already
>> decided to do the eviction, it is too late,
>>
>> Steve.
> Okay, I scrapped that idea and decided to do something very similar to
> what GFS1 did. That is, to use a daemon to eliminate glocks that are no
> longer needed. Rather than introduce a new daemon, I'm adding new functionality
> to the existing quota daemon. This version seems to be running just as
> well with the scenario I've been struggling with for so long.
>
> What do you think? If we can't do this, I'm out of ideas on how to solve it.
I don't think it is a good plan to go back to that system. It caused
problems which we've long since solved. Let's take a step back and see if
we can find a different solution here. I know it is not easy, and the
code path is one of the trickiest in the whole of GFS2, which is why
we need to be very careful.
I'm going to try and block out some time tomorrow and look at the
remaining patches which you posted back in December which are still
waiting for review, and I'll think a bit more about this issue at the
same time, since they are all connected,
Steve.
> Regards,
>
> Bob Peterson
> Red Hat File Systems
> ---
> Patch description:
>
> This patch adds a new rubbish list for glocks. When a glock's reference
> count goes to zero, it gets thrown on the rubbish list, rather than
> being freed and unlocked in dlm immediately. If we tell dlm to unlock
> immediately, we run the risk of a livelock in which GFS2 waits forever
> for DLM to release the lock, DLM waits forever for a pending fence
> action, but the fence action waits forever for memory, which itself
> blocks on the inode shrinker, which is trying to free inodes.
>
> With this new method, the glock gets tossed onto the rubbish list,
> then the inode free may continue. Thus, the shrinker may continue
> running, avoiding the livelock. The glocks on the rubbish list are
> processed periodically by the quota daemon, which is very similar to
> what GFS1 used to do (but GFS1 had several daemons; one was dedicated
> to this process).
>
> When the daemon processes the rubbish list, it first checks to see
> if the glock has been recreated in the hash tables. If so, it can't
> instruct DLM to unlock it; the new lock supersedes the dead one.
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
>
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 8dedece..703a4fd 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -67,6 +67,8 @@ struct workqueue_struct *gfs2_delete_workqueue;
> static LIST_HEAD(lru_list);
> static atomic_t lru_count = ATOMIC_INIT(0);
> static DEFINE_SPINLOCK(lru_lock);
> +static LIST_HEAD(rubbish_list);
> +static DEFINE_SPINLOCK(rubbish_lock);
>
> #define GFS2_GL_HASH_SHIFT 15
> #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
> @@ -160,7 +162,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
>
> void gfs2_glock_put(struct gfs2_glock *gl)
> {
> - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> struct address_space *mapping = gfs2_glock2aspace(gl);
>
> if (lockref_put_or_lock(&gl->gl_lockref))
> @@ -173,8 +174,44 @@ void gfs2_glock_put(struct gfs2_glock *gl)
> rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
> GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
> GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
> - trace_gfs2_glock_put(gl);
> - sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
> + spin_lock(&rubbish_lock);
> + list_add_tail(&gl->gl_lru, &rubbish_list);
> + spin_unlock(&rubbish_lock);
> +}
> +
> +/**
> + * gfs2_rubbish_removal - destroy glocks that are no longer wanted
> + */
> +long gfs2_rubbish_removal(long count)
> +{
> + struct gfs2_glock *gl;
> + struct gfs2_sbd *sdp;
> + long removed = 0;
> +
> + spin_lock(&rubbish_lock);
> + while(count-- && !list_empty(&rubbish_list)) {
> + gl = list_entry(rubbish_list.next, struct gfs2_glock, gl_lru);
> + BUG_ON(!__lockref_is_dead(&gl->gl_lockref));
> + sdp = gl->gl_name.ln_sbd;
> + list_del_init(&gl->gl_lru);
> + spin_unlock(&rubbish_lock);
> +
> + /* If it was recreated after being sacked, don't tell dlm to
> + unlock. The new one supersedes the one in the rubbish. */
> + if (rhashtable_lookup_fast(&gl_hash_table, &gl->gl_name,
> + ht_parms)) {
> + gfs2_glock_free(gl);
> + } else { /* was not recreated */
> + trace_gfs2_glock_put(gl);
> + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
> + }
> + removed++;
> + if (count && !list_empty(&rubbish_list))
> + cond_resched();
> + spin_lock(&rubbish_lock);
> + }
> + spin_unlock(&rubbish_lock);
> + return removed;
> }
>
> /**
> @@ -1503,13 +1540,21 @@ static void dump_glock_func(struct gfs2_glock *gl)
>
> void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
> {
> + long removed;
> + int i;
> +
> set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
> flush_workqueue(glock_workqueue);
> glock_hash_walk(clear_glock, sdp);
> flush_workqueue(glock_workqueue);
> - wait_event_timeout(sdp->sd_glock_wait,
> - atomic_read(&sdp->sd_glock_disposal) == 0,
> - HZ * 600);
> + for (i = 0; i < 600; i++) {
> + removed = gfs2_rubbish_removal(LONG_MAX);
> + wait_event_timeout(sdp->sd_glock_wait,
> + atomic_read(&sdp->sd_glock_disposal) == 0,
> + HZ);
> + if (atomic_read(&sdp->sd_glock_disposal) == 0)
> + break;
> + }
> glock_hash_walk(dump_glock_func, sdp);
> }
>
> diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
> index 46ab67f..5fe645f 100644
> --- a/fs/gfs2/glock.h
> +++ b/fs/gfs2/glock.h
> @@ -203,6 +203,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
> #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
> extern __printf(2, 3)
> void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
> +extern long gfs2_rubbish_removal(long count);
>
> /**
> * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
> diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> index be6d9c4..c343c2e 100644
> --- a/fs/gfs2/quota.c
> +++ b/fs/gfs2/quota.c
> @@ -1534,6 +1534,7 @@ int gfs2_quotad(void *data)
> unsigned long t = 0;
> DEFINE_WAIT(wait);
> int empty;
> + long removed;
>
> while (!kthread_should_stop()) {
>
> @@ -1555,6 +1556,9 @@ int gfs2_quotad(void *data)
> /* Check for & recover partially truncated inodes */
> quotad_check_trunc_list(sdp);
>
> + /* Destroy any glocks that need destroying */
> + removed = gfs2_rubbish_removal(2500);
> +
> try_to_freeze();
>
> t = min(quotad_timeo, statfs_timeo);
> @@ -1563,7 +1567,7 @@ int gfs2_quotad(void *data)
> spin_lock(&sdp->sd_trunc_lock);
> empty = list_empty(&sdp->sd_trunc_list);
> spin_unlock(&sdp->sd_trunc_lock);
> - if (empty && !sdp->sd_statfs_force_sync)
> + if (empty && removed == 0 && !sdp->sd_statfs_force_sync)
> t -= schedule_timeout(t);
> else
> t = 0;