From: Steven Whitehouse
Date: Mon, 3 Jul 2017 10:26:31 +0100
Subject: [Cluster-devel] [GFS2 PATCH] GFS2: Rework rgrp glock congestion functions for intra-node
In-Reply-To: <513776377.27803960.1498841020404.JavaMail.zimbra@redhat.com>
References: <513776377.27803960.1498841020404.JavaMail.zimbra@redhat.com>
Message-ID: <155a05d7-f43e-f24a-c67a-b92e7e5974be@redhat.com>
To: cluster-devel.redhat.com

Hi,

On 30/06/17 17:43, Bob Peterson wrote:
> Hi,
>
> This patch reworks the "congestion" algorithms for resource group
> glocks to take into account intra-node demands as well as inter-
> node demands.
>
> Signed-off-by: Bob Peterson
I'm not sure I really understand what this is trying to achieve. What
is better about the resulting performance/block layout?

Steve.

> ---
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index ad62bfb..903f58b 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -1807,6 +1807,21 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>  }
>  
>  /**
> + * gfs2_rgrp_used_recently
> + * @rgd: The rgrp to test
> + * @msecs: The time limit in milliseconds
> + *
> + * Returns: True if the rgrp glock has been used within the time limit
> + */
> +static inline bool gfs2_rgrp_used_recently(const struct gfs2_rgrpd *rgd,
> +					   u64 msecs)
> +{
> +	return (ktime_before(ktime_get_real(),
> +			     ktime_add(rgd->rd_gl->gl_dstamp,
> +				       ms_to_ktime(msecs))));
> +}
> +
> +/**
>   * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
>   * @rgd: The rgrp in question
>   * @loops: An indication of how picky we can be (0=very, 1=less so)
> @@ -1833,7 +1848,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>   * Returns: A boolean verdict on the congestion status
>   */
>  
> -static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
> +static bool gfs2_rgrp_congested_dlm(const struct gfs2_rgrpd *rgd, int loops)
>  {
>  	const struct gfs2_glock *gl = rgd->rd_gl;
>  	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> @@ -1845,6 +1860,11 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
>  	u64 var;
>  	int cpu, nonzero = 0;
>  
> +	/* If it hasn't been used recently we can't judge the statistics, so
> +	   assume it's not congested. */
> +	if (!gfs2_rgrp_used_recently(rgd, HZ))
> +		return false;
> +
>  	preempt_disable();
>  	for_each_present_cpu(cpu) {
>  		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
> @@ -1880,21 +1900,66 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
>  }
>  
>  /**
> - * gfs2_rgrp_used_recently
> - * @rs: The block reservation with the rgrp to test
> - * @msecs: The time limit in milliseconds
> + * fast_to_acquire - determine if a resource group will be fast to acquire
>   *
> - * Returns: True if the rgrp glock has been used within the time limit
> + * If this is one of our preferred rgrps, it should be quicker to acquire,
> + * because we tried to set ourselves up as dlm lock master.
> + */
> +static inline bool fast_to_acquire(const struct gfs2_rgrpd *rgd)
> +{
> +	struct gfs2_glock *gl = rgd->rd_gl;
> +
> +	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
> +	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
> +	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
> +		return true;
> +	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
> +		return true;
> +	return false;
> +}
> +
> +/**
> + * gfs2_rgrp_congested - decide whether a rgrp glock is congested
> + * @rgd: The rgrp in question
> + * @loops: An indication of how picky we can be (0=very, 1=less so)
> + *
> + * There are two kinds of congestion: inter-node and intra-node.
> + *
> + * Inter-node congestion is where multiple nodes all want to allocate blocks
> + * inside the same rgrp, which means they need to trade the rgrp glock back
> + * and forth, which is a slow process. To mitigate this, we use glock
> + * statistics to predict whether the glock is historically fast to acquire.
> + *
> + * Intra-node congestion is where you have multiple processes on the same
> + * node, all trying to allocate blocks in the same rgrp. There's nothing wrong
> + * with doing so, but each process needs to wait for the other to release the
> + * rgrp glock before it may proceed. We can predict whether a rgrp glock is
> + * congested by how many block reservations are currently attached.
> + *
> + * Both kinds of congestion can hurt performance, but it's faster to check
> + * intra-node, so we do that first. After all, why bother to check if we can
> + * get the glock quickly from DLM if other processes have also used that
> + * same reasoning.
> + *
> + * We know the number of loops we've been around, so we know how desperate we
> + * are to find something. On first loop, call it congested if anyone else has
> + * a block reservation. On second loop, call it congested if it's not fast to
> + * acquire.
> + */
> -static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
> -				    u64 msecs)
> +static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
>  {
> -	u64 tdiff;
> +	/* Check for intra-node congestion */
> +	if (loops == 0 && !RB_EMPTY_ROOT(&rgd->rd_rstree))
> +		return true;
> +
> +	if (loops == 1 && !fast_to_acquire(rgd))
> +		return true;
>  
> -	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
> -			    rs->rs_rbm.rgd->rd_gl->gl_dstamp));
> +	/* Check for inter-node congestion */
> +	if (rgd->rd_sbd->sd_lockstruct.ls_ops->lm_lock) /* lock_dlm */
> +		return gfs2_rgrp_congested_dlm(rgd, loops);
>  
> -	return tdiff > (msecs * 1000 * 1000);
> +	return false;
>  }
>  
>  static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
> @@ -1921,25 +1986,6 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
>  }
>  
>  /**
> - * fast_to_acquire - determine if a resource group will be fast to acquire
> - *
> - * If this is one of our preferred rgrps, it should be quicker to acquire,
> - * because we tried to set ourselves up as dlm lock master.
> - */
> -static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
> -{
> -	struct gfs2_glock *gl = rgd->rd_gl;
> -
> -	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
> -	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
> -	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
> -		return 1;
> -	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
> -		return 1;
> -	return 0;
> -}
> -
> -/**
>   * gfs2_inplace_reserve - Reserve space in the filesystem
>   * @ip: the inode to reserve space for
>   * @ap: the allocation parameters
> @@ -1995,7 +2041,6 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
>  			    !fast_to_acquire(rs->rs_rbm.rgd))
>  				goto next_rgrp;
>  			if ((loops < 2) &&
> -			    gfs2_rgrp_used_recently(rs, 1000) &&
>  			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
>  				goto next_rgrp;
>  		}
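
For anyone trying to follow the intent from the hunks above, here is a rough
standalone sketch of how the reworked check seems meant to interact with the
allocation retry loop. This is not the GFS2 code: the "toy_" types and helpers
are made-up stand-ins, and the loop is boiled down to just the escalation the
new comment block describes.

/* Simplified illustration only -- invented stand-ins, not the real
 * GFS2/DLM structures or functions. */
#include <stdbool.h>
#include <stddef.h>

struct toy_rgrp {
	bool has_other_reservations; /* stand-in for !RB_EMPTY_ROOT(&rd_rstree) */
	bool fast_to_acquire;        /* stand-in for fast_to_acquire(rgd) */
	bool dlm_stats_congested;    /* stand-in for gfs2_rgrp_congested_dlm() */
};

/* Mirrors the escalation described in the patch comment:
 * loop 0: call it congested if any other reservation is attached
 * loop 1: call it congested if the glock does not look fast to acquire
 * either way, fall back to the DLM statistics for inter-node congestion */
static bool toy_rgrp_congested(const struct toy_rgrp *rgd, int loops)
{
	if (loops == 0 && rgd->has_other_reservations)
		return true;
	if (loops == 1 && !rgd->fast_to_acquire)
		return true;
	return rgd->dlm_stats_congested;
}

/* Caller side: walk the resource groups up to three times, getting less
 * picky each pass; only the first two passes skip a "congested" rgrp. */
static const struct toy_rgrp *toy_pick_rgrp(const struct toy_rgrp *rgrps,
					    size_t n)
{
	for (int loops = 0; loops < 3; loops++) {
		for (size_t i = 0; i < n; i++) {
			if (loops < 2 && toy_rgrp_congested(&rgrps[i], loops))
				continue; /* try the next rgrp on this pass */
			return &rgrps[i];
		}
	}
	return NULL; /* nothing acceptable: filesystem effectively full */
}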