* [Cluster-devel] [GFS2 PATCH] GFS2: Rework rgrp glock congestion functions for intra-node
[not found] <1520935315.27803769.1498840932508.JavaMail.zimbra@redhat.com>
@ 2017-06-30 16:43 ` Bob Peterson
2017-07-03 9:26 ` Steven Whitehouse
0 siblings, 1 reply; 3+ messages in thread
From: Bob Peterson @ 2017-06-30 16:43 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
This patch reworks the "congestion" algorithms for resource group
glocks to take into account intra-node demands as well as inter-
node demands.
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ad62bfb..903f58b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1807,6 +1807,21 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
}
/**
+ * gfs2_rgrp_used_recently
+ * @rgd: The resource group whose glock is to be tested
+ * @msecs: The time limit in milliseconds
+ *
+ * Returns: True if the rgrp glock has been used within the time limit
+ */
+static inline bool gfs2_rgrp_used_recently(const struct gfs2_rgrpd *rgd,
+ u64 msecs)
+{
+ return (ktime_before(ktime_get_real(),
+ ktime_add(rgd->rd_gl->gl_dstamp,
+ ms_to_ktime(msecs))));
+}
+
+/**
* gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
* @rgd: The rgrp in question
* @loops: An indication of how picky we can be (0=very, 1=less so)
@@ -1833,7 +1848,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
* Returns: A boolean verdict on the congestion status
*/
-static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
+static bool gfs2_rgrp_congested_dlm(const struct gfs2_rgrpd *rgd, int loops)
{
const struct gfs2_glock *gl = rgd->rd_gl;
const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -1845,6 +1860,11 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
u64 var;
int cpu, nonzero = 0;
+ /* If it hasn't been used recently we can't judge the statistics, so
+ assume it's not congested. */
+ if (!gfs2_rgrp_used_recently(rgd, 1000))
+ return false;
+
preempt_disable();
for_each_present_cpu(cpu) {
st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
@@ -1880,21 +1900,66 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
}
/**
- * gfs2_rgrp_used_recently
- * @rs: The block reservation with the rgrp to test
- * @msecs: The time limit in milliseconds
+ * fast_to_acquire - determine if a resource group will be fast to acquire
*
- * Returns: True if the rgrp glock has been used within the time limit
+ * If this is one of our preferred rgrps, it should be quicker to acquire,
+ * because we tried to set ourselves up as dlm lock master.
+ */
+static inline bool fast_to_acquire(const struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_glock *gl = rgd->rd_gl;
+
+ if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ return true;
+ if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+ return true;
+ return false;
+}
+
+/**
+ * gfs2_rgrp_congested - decide whether a rgrp glock is congested
+ * @rgd: The rgrp in question
+ * @loops: An indication of how picky we can be (0=very, 1=less so)
+ *
+ * There are two kinds of congestion: inter-node and intra-node.
+ *
+ * Inter-node congestion is where multiple nodes all want to allocate blocks
+ * inside the same rgrp, which means they need to trade the rgrp glock back
+ * and forth, which is a slow process. To mitigate this, we use glock
+ * statistics to predict whether the glock is historically fast to acquire.
+ *
+ * Intra-node congestion is where you have multiple processes on the same
+ * node, all trying to allocate blocks in the same rgrp. There's nothing wrong
+ * with doing so, but each process needs to wait for the other to release the
+ * rgrp glock before it may proceed. We can predict whether a rgrp glock is
+ * congested by how many block reservations are currently attached.
+ *
+ * Both kinds of congestion can hurt performance, but it's faster to check
+ * intra-node, so we do that first. After all, why bother to check if we can
+ * get the glock quickly from DLM if other processes have also used that
+ * same reasoning?
+ *
+ * We know the number of loops we've been around, so we know how desperate we
+ * are to find something. On first loop, call it congested if anyone else has
+ * a block reservation. On second loop, call it congested if it's not fast to
+ * acquire.
*/
-static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
- u64 msecs)
+static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
{
- u64 tdiff;
+ /* Check for intra-node congestion */
+ if (loops == 0 && !RB_EMPTY_ROOT(&rgd->rd_rstree))
+ return true;
+
+ if (loops == 1 && !fast_to_acquire(rgd))
+ return true;
- tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
- rs->rs_rbm.rgd->rd_gl->gl_dstamp));
+ /* Check for inter-node congestion */
+ if (rgd->rd_sbd->sd_lockstruct.ls_ops->lm_lock) /* lock_dlm */
+ return gfs2_rgrp_congested_dlm(rgd, loops);
- return tdiff > (msecs * 1000 * 1000);
+ return false;
}
static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
@@ -1921,25 +1986,6 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
}
/**
- * fast_to_acquire - determine if a resource group will be fast to acquire
- *
- * If this is one of our preferred rgrps, it should be quicker to acquire,
- * because we tried to set ourselves up as dlm lock master.
- */
-static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
-{
- struct gfs2_glock *gl = rgd->rd_gl;
-
- if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
- !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags))
- return 1;
- if (rgd->rd_flags & GFS2_RDF_PREFERRED)
- return 1;
- return 0;
-}
-
-/**
* gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
* @ap: the allocation parameters
@@ -1995,7 +2041,6 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
!fast_to_acquire(rs->rs_rbm.rgd))
goto next_rgrp;
if ((loops < 2) &&
- gfs2_rgrp_used_recently(rs, 1000) &&
gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
goto next_rgrp;
}
* [Cluster-devel] [GFS2 PATCH] GFS2: Rework rgrp glock congestion functions for intra-node
2017-06-30 16:43 ` [Cluster-devel] [GFS2 PATCH] GFS2: Rework rgrp glock congestion functions for intra-node Bob Peterson
@ 2017-07-03 9:26 ` Steven Whitehouse
0 siblings, 0 replies; 3+ messages in thread
From: Steven Whitehouse @ 2017-07-03 9:26 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
On 30/06/17 17:43, Bob Peterson wrote:
> Hi,
>
> This patch reworks the "congestion" algorithms for resource group
> glocks to take into account intra-node demands as well as inter-
> node demands.
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
I'm not sure I really understand what this is trying to achieve. What is
better about the resulting performance/block layout?
Steve.
> [patch quoted in full; snipped]
* [Cluster-devel] [GFS2 PATCH] GFS2: Rework rgrp glock congestion functions for intra-node
[not found] <1348332894.38934178.1501871083680.JavaMail.zimbra@redhat.com>
@ 2017-08-04 18:43 ` Bob Peterson
0 siblings, 0 replies; 3+ messages in thread
From: Bob Peterson @ 2017-08-04 18:43 UTC (permalink / raw)
To: cluster-devel.redhat.com
Hi,
On 30 June, I posted a patch by the same name, with little
explanation. Steve Whitehouse had some doubts, saying:
> I'm not sure I really understand what this is trying to achieve. What is
> better about the resulting performance/block layout?
Since that time, I've made some revisions and improvements to
the patch, plus I've expanded the patch description to better
explain what I'm trying to accomplish.
In addition, I've attached a spreadsheet / chart showing the overall
throughput of the single-node iozone benchmark, run with a varying
number of concurrent processes on one of our performance testing
machines. It demonstrates a noticeable performance improvement with
the patch versus the stock RHEL7.4 kernel.
Patch description:
------------------
There are two kinds of congestion: inter-node and intra-node.
Inter-node congestion is where multiple nodes want to allocate
blocks inside the same rgrp, which means they need to trade the
rgrp glock back and forth, which is a slow process. To mitigate
this, we use glock statistics to predict whether the glock is
historically fast to acquire. This hasn't really changed.
Intra-node congestion is where you have multiple processes on
the same node, all trying to allocate blocks in the same rgrp.
There's nothing wrong with doing so, but each process needs to
wait for the other to release the rgrp glock before it may
proceed, and thus it's slower than it needs to be.
If each of those processes operated on separate rgrps, they
wouldn't need to do all this glock waiting, and thus would be
faster.
Both kinds of congestion can hurt performance, but it's faster
to check intra-node, so we do that first. After all, why bother
to check whether we can get the glock quickly from DLM if other
processes have also used that same reasoning? They'd just
generate intra-node congestion.
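To make that ordering concrete, here is a minimal illustrative
sketch (not the patch itself; the names come from the diff below):

    /* Sketch only: check cheap node-local state before paying for
     * the DLM statistics scan. */
    static bool congested_sketch(const struct gfs2_rgrpd *rgd, int loops)
    {
            if (!RB_EMPTY_ROOT(&rgd->rd_rstree))    /* intra-node */
                    return true;
            return gfs2_rgrp_congested_dlm(rgd, loops); /* inter-node */
    }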
So this patch reworks the "congestion" algorithms for resource
group glocks to take into account intra-node demands as well as
inter-node demands.
We can predict whether a rgrp glock is the victim of intra-node
congestion based on a set of rules (condensed into a sketch after
this list). Rules enforced by this patch:
1. If the current process has a multi-block reservation in the
rgrp, it needs to use it regardless of the congestion. The
congestion algorithm should have prevented the reservation
in the first place.
2. If some process has the rgrp glock's gl_lockref spin_lock locked,
it is preparing to use the rgrp for a reservation, so we take
this as a clear sign of impending contention.
3. If the rgrp currently has a glock holder, we know we need to
wait before we can lock it, regardless of whether the holder
represents an actual holder or a waiter.
4. If the rgrp currently has a multi-block reservation, and we
already know it's not ours, then intra-node contention is
likely.
5. If none of these conditions are true, we check to see if we
can acquire (lock / enqueue) the glock relatively quickly.
If this is lock_dlm protocol, we check if the rgrp is one
of our preferred rgrps. If so, we treat it as fast. As it
was before this patch, for lock_nolock protocol, all rgrps
are considered "preferred."
6. If the rgrp glock is unlocked, it's generally not fast to
acquire. At the least, we need to push it through the glock
state machine and read it from the media. Worst case, we also
need to get dlm's permission to do so. This is ignored for
preferred rgrps, since we want to set up easy access to them
anyway.
7. If the DEMOTE or DEMOTE_IN_PROGRESS bits are set, we know we
have an even longer wait from the glock state machine, as
any enqueues will need to wait for the glock to be demoted,
then promoted again.
8. If, after all this, we deem the rgrp to be good enough for
this attempt (loop), and the locking protocol is lock_dlm,
we do the normal checks we did before this patch. Namely, we
check the internode locking statistics kept by lock_dlm to
see if it's a hot spot for the cluster.
Note that with each "loop" through the list of rgrps, we become
more lax with our requirements: the first time through, we can
insist on an rgrp no one else is using, but on subsequent passes
we must accept rgrps that are less than ideal.
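The rules above condense to roughly the following sketch of the
reworked gfs2_rgrp_congested() (illustrative; the diff below is
authoritative):

    static bool gfs2_rgrp_congested(const struct gfs2_blkreserv *rs, int loops,
                                    bool locked)
    {
            const struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;

            if (gfs2_rs_active(rs))         /* rule 1: must use our rsrv */
                    return false;
            if (loops >= 2)                 /* desperate: accept anything */
                    return false;
            /* rules 2-4: lockref locked, holders queued, other rsrvs */
            if (loops == 0 && other_rgrp_users(rgd, locked))
                    return true;
            /* rules 5-7: not preferred, unlocked, or being demoted */
            if (!fast_to_acquire(rgd))
                    return true;
            /* rule 8: finally, consult the inter-node DLM statistics */
            if (rgd->rd_sbd->sd_lockstruct.ls_ops->lm_lock) /* lock_dlm */
                    return gfs2_rgrp_congested_dlm(rgd, loops);
            return false;
    }

(The loops <= 1 guard on the fast_to_acquire() check in the diff is
implicit here, since loops >= 2 has already returned.)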
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 29fbeee36fa6..daf87750260d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1806,7 +1806,22 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
}
/**
- * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
+ * gfs2_rgrp_used_recently
+ * @rgd: The resource group whose glock is to be tested
+ * @msecs: The time limit in milliseconds
+ *
+ * Returns: True if the rgrp glock has been used within the time limit
+ */
+static inline bool gfs2_rgrp_used_recently(const struct gfs2_rgrpd *rgd,
+ u64 msecs)
+{
+ u64 tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
+ rgd->rd_gl->gl_dstamp));
+ return (tdiff <= (msecs * 1000 * 1000));
+}
+
+/**
+ * gfs2_rgrp_congested_dlm - Use stats to figure out if an rgrp is congested
* @rgd: The rgrp in question
* @loops: An indication of how picky we can be (0=very, 1=less so)
*
@@ -1832,7 +1847,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
* Returns: A boolean verdict on the congestion status
*/
-static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
+static bool gfs2_rgrp_congested_dlm(const struct gfs2_rgrpd *rgd, int loops)
{
const struct gfs2_glock *gl = rgd->rd_gl;
const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -1844,6 +1859,11 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
u64 var;
int cpu, nonzero = 0;
+ /* If it hasn't been used recently we can't judge the statistics, so
+ assume it's not congested. */
+ if (!gfs2_rgrp_used_recently(rgd, 1000))
+ return false;
+
preempt_disable();
for_each_present_cpu(cpu) {
st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
@@ -1879,21 +1899,110 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
}
/**
- * gfs2_rgrp_used_recently
- * @rs: The block reservation with the rgrp to test
- * @msecs: The time limit in milliseconds
+ * fast_to_acquire - determine if a resource group will be fast to acquire
*
- * Returns: True if the rgrp glock has been used within the time limit
+ * If this is one of our preferred rgrps, it should be quicker to acquire,
+ * because we tried to set ourselves up as dlm lock master.
+ */
+static inline bool fast_to_acquire(const struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_glock *gl = rgd->rd_gl;
+
+ if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+ return true;
+ if (gl->gl_state == LM_ST_UNLOCKED)
+ return false;
+ if (!list_empty(&gl->gl_holders))
+ return false;
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) ||
+ test_bit(GLF_DEMOTE, &gl->gl_flags))
+ return false;
+ return true;
+}
+
+/**
+ * other_rgrp_users - figure out if this rgrp has other users
+ * @rgd: The resource group
+ * @locked: true if we have already acquired the glock
+ *
+ * We're trying to figure out if the given rgrp has anybody competing for
+ * its free space. If other processes have enqueued its glock, there's a
+ * good chance of competition.
+ *
+ * If there are multi-block reservations for this rgrp, there's a good
+ * chance another process will lock the rgrp for block allocations soon.
+ *
+ * If we have already acquired the glock, we no longer care whether there are
+ * holders, because that's now a given (rgrp glocks are never shared).
+ */
+static inline bool other_rgrp_users(const struct gfs2_rgrpd *rgd, bool locked)
+{
+ struct gfs2_glock *gl = rgd->rd_gl;
+
+ if (spin_is_locked(&gl->gl_lockref.lock)) /* someone preparing to use it. */
+ return true;
+ if (!locked && !list_empty(&gl->gl_holders))
+ return true;
+ if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+ return true;
+ return false;
+}
+
+/**
+ * gfs2_rgrp_congested - decide whether a rgrp glock is congested
+ * @rgd: The rgrp in question
+ * @loops: An indication of how picky we can be (0=very, 1=less so)
+ * @locked: Indicates if checks are before or after we've enqueued the glock.
+ *
+ * There are two kinds of congestion: inter-node and intra-node.
+ *
+ * Inter-node congestion is where multiple nodes all want to allocate blocks
+ * inside the same rgrp, which means they need to trade the rgrp glock back
+ * and forth, which is a slow process. To mitigate this, we use glock
+ * statistics to predict whether the glock is historically fast to acquire.
+ *
+ * Intra-node congestion is where you have multiple processes on the same
+ * node, all trying to allocate blocks in the same rgrp. There's nothing wrong
+ * with doing so, but each process needs to wait for the other to release the
+ * rgrp glock before it may proceed. We can predict whether a rgrp glock is
+ * congested by how many block reservations are currently attached.
+ *
+ * Both kinds of congestion can hurt performance, but it's faster to check
+ * intra-node, so we do that first. After all, why bother to check if we can
+ * get the glock quickly from DLM if other processes have also used that
+ * same reasoning?
+ *
+ * We know the number of loops we've been around, so we know how desperate we
+ * are to find something. On the first loop, call it congested if anyone else
+ * is using the rgrp or it's not fast to acquire; on the second loop, only if
+ * it's not fast to acquire.
*/
-static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
- u64 msecs)
+static bool gfs2_rgrp_congested(const struct gfs2_blkreserv *rs, int loops,
+ bool locked)
{
- u64 tdiff;
+ const struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
+
+ /* If we already have a reservation, we must use it regardless */
+ if (gfs2_rs_active(rs))
+ return false;
+
+ /* If we've rejected all the rgrps a few times, we can no longer worry
+ about whether the rgrp is congested. Fill in blocks where we can. */
+ if (loops >= 2)
+ return false;
+
+ /* Check for intra-node congestion */
+ if (loops == 0 && other_rgrp_users(rgd, locked))
+ return true;
+
+ if (loops <= 1 && !fast_to_acquire(rgd))
+ return true;
- tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
- rs->rs_rbm.rgd->rd_gl->gl_dstamp));
+ /* Check for inter-node congestion */
+ if (rgd->rd_sbd->sd_lockstruct.ls_ops->lm_lock) /* lock_dlm */
+ return gfs2_rgrp_congested_dlm(rgd, loops);
- return tdiff > (msecs * 1000 * 1000);
+ return false;
}
static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
@@ -1920,25 +2029,6 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
}
/**
- * fast_to_acquire - determine if a resource group will be fast to acquire
- *
- * If this is one of our preferred rgrps, it should be quicker to acquire,
- * because we tried to set ourselves up as dlm lock master.
- */
-static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
-{
- struct gfs2_glock *gl = rgd->rd_gl;
-
- if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
- !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
- !test_bit(GLF_DEMOTE, &gl->gl_flags))
- return 1;
- if (rgd->rd_flags & GFS2_RDF_PREFERRED)
- return 1;
- return 0;
-}
-
-/**
* gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
* @ap: the allocation parameters
@@ -1988,22 +2078,14 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
rg_locked = 0;
if (skip && skip--)
goto next_rgrp;
- if (!gfs2_rs_active(rs)) {
- if (loops == 0 &&
- !fast_to_acquire(rs->rs_rbm.rgd))
+ if (gfs2_rgrp_congested(rs, loops, false))
goto next_rgrp;
- if ((loops < 2) &&
- gfs2_rgrp_used_recently(rs, 1000) &&
- gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
- goto next_rgrp;
- }
error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
LM_ST_EXCLUSIVE, flags,
&rs->rs_rgd_gh);
if (unlikely(error))
return error;
- if (!gfs2_rs_active(rs) && (loops < 2) &&
- gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+ if (gfs2_rgrp_congested(rs, loops, true))
goto skip_rgrp;
if (sdp->sd_args.ar_rgrplvb) {
error = update_rgrp_lvb(rs->rs_rbm.rgd);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: iozone.scalability2.ods
Type: application/vnd.oasis.opendocument.spreadsheet
Size: 19978 bytes
Desc: not available
URL: <http://listman.redhat.com/archives/cluster-devel/attachments/20170804/a43360c7/attachment.ods>