From: Steven Whitehouse <swhiteho@redhat.com>
To: cluster-devel.redhat.com
Subject: [Cluster-devel] [GFS2 PATCH] GFS2: Convert LRU list to use a hash table to reduce contention
Date: Mon, 3 Jul 2017 10:21:19 +0100
Message-ID: <7026f0ef-002b-bf83-4658-1cbb2fbfdd82@redhat.com>
In-Reply-To: <1075213831.27823397.1498846716685.JavaMail.zimbra@redhat.com>
Hi,
On 30/06/17 19:18, Bob Peterson wrote:
> Hi,
>
> This patch changes GFS2's LRU list from a single linked list with
> a much-contended spin_lock to a hash table of linked lists, thus
> having more locks with less contention. The key for the table is
> set round-robin to evenly distribute the glocks into the buckets.
>
> Signed-off-by: Bob Peterson <rpeterso@redhat.com>
This is a bit confused, I think. There needs to be some more work done
here to figure out exactly what the problem is, and I'm not very keen on
this solution for a number of reasons:
- It still claims to be an LRU, but in fact it isn't one any more, since the
round-robin bucket assignment and per-bucket scanning no longer preserve any
least-recently-used ordering
- The list might have been broken up, but lru_count is still a single global
atomic, so there is still the possibility of contention on that (see the
rough per-bucket counter sketch just after this list)
- There is no performance data to show whether or not there is any
improvement
- There should not in general be huge numbers of LRU operations on
glocks anyway, so why is the LRU list contended in the first place?
- It introduces a new global counter (lru_bucket) which is bumped at glock
creation time, so that is something else that might end up being a
contention point
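
To illustrate the counter point, here is a rough, untested sketch of keeping
the count per bucket, so that nothing global is touched on the add/remove
paths, and of deriving the bucket from the glock number rather than from a
creation-time counter. This is only for illustration: the helper names
(lru_bucket(), lru_add(), lru_del(), lru_total()), the per-bucket spinlock
and the per-bucket count are mine, not from the patch, and the shrinker
callbacks would also need updating to use them.

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define LRU_HASH_BITS    12
#define LRU_HASH_BUCKETS (1U << LRU_HASH_BITS)
#define LRU_HASH_MASK    (LRU_HASH_BUCKETS - 1)

struct gfs2_lru_bucket {
	struct list_head lru_list;
	spinlock_t lru_lock;
	long lru_count;		/* per-bucket count; no global atomic */
};

static struct gfs2_lru_bucket lru_buckets[LRU_HASH_BUCKETS];

/* Pick the bucket from the glock number instead of a global counter. */
static inline struct gfs2_lru_bucket *lru_bucket(u64 glock_number)
{
	return &lru_buckets[glock_number & LRU_HASH_MASK];
}

static void lru_add(struct gfs2_lru_bucket *b, struct list_head *entry)
{
	spin_lock(&b->lru_lock);
	if (list_empty(entry)) {
		list_add_tail(entry, &b->lru_list);
		b->lru_count++;		/* only this bucket's count changes */
	}
	spin_unlock(&b->lru_lock);
}

static void lru_del(struct gfs2_lru_bucket *b, struct list_head *entry)
{
	spin_lock(&b->lru_lock);
	if (!list_empty(entry)) {
		list_del_init(entry);
		b->lru_count--;
	}
	spin_unlock(&b->lru_lock);
}

/* Approximate total for the shrinker's count callback. */
static long lru_total(void)
{
	long total = 0;
	unsigned int i;

	for (i = 0; i < LRU_HASH_BUCKETS; i++)
		total += lru_buckets[i].lru_count;
	return total;
}

Whether anything like that is worth doing depends on whether the counter
really is contended, which again comes back to the question of why there are
so many LRU operations in the first place.
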
The important question is what causes the contention on the LRU list in
the first place. Ideally we'd look at that issue in order to avoid so
many list operations, rather than making the code very complicated and no
longer LRU. At the very least there should be a justification for the new
algorithm,
Steve.
> ---
> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
> index 6cd71c5..e1e16c4 100644
> --- a/fs/gfs2/glock.c
> +++ b/fs/gfs2/glock.c
> @@ -63,10 +63,32 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
>
> static struct dentry *gfs2_root;
> static struct workqueue_struct *glock_workqueue;
> +static atomic_t lru_bucket = ATOMIC_INIT(0);
> +
> struct workqueue_struct *gfs2_delete_workqueue;
> -static LIST_HEAD(lru_list);
> +
> +#define LRU_HASH_BITS 12
> +#define LRU_HASH_BUCKETS BIT(LRU_HASH_BITS)
> +#define LRU_HASH_MASK (LRU_HASH_BUCKETS - 1)
> +
> +struct gfs2_lru_bucket {
> + struct list_head lru_list;
> + rwlock_t lru_lock;
> +};
> +
> +static struct gfs2_lru_bucket lru_locks[LRU_HASH_BUCKETS];
> +
> +static inline void lock_lru_bucket(unsigned bucket)
> +{
> + write_lock(&lru_locks[bucket & LRU_HASH_MASK].lru_lock);
> +}
> +
> +static inline void unlock_lru_bucket(unsigned bucket)
> +{
> + write_unlock(&lru_locks[bucket & LRU_HASH_MASK].lru_lock);
> +}
> +
> static atomic_t lru_count = ATOMIC_INIT(0);
> -static DEFINE_SPINLOCK(lru_lock);
>
> #define GFS2_GL_HASH_SHIFT 15
> #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT)
> @@ -126,30 +148,36 @@ static int demote_ok(const struct gfs2_glock *gl)
> return 1;
> }
>
> -
> -void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
> +static void remove_from_lru_locked(struct gfs2_glock *gl)
> {
> - spin_lock(&lru_lock);
> -
> - if (!list_empty(&gl->gl_lru))
> + if (!list_empty(&gl->gl_lru)) {
> list_del_init(&gl->gl_lru);
> - else
> - atomic_inc(&lru_count);
> + atomic_dec(&lru_count);
> + }
> +}
> +
> +static void glock_remove_from_lru(struct gfs2_glock *gl)
> +{
> + if (!test_and_clear_bit(GLF_LRU, &gl->gl_flags))
> + return;
>
> - list_add_tail(&gl->gl_lru, &lru_list);
> - set_bit(GLF_LRU, &gl->gl_flags);
> - spin_unlock(&lru_lock);
> + lock_lru_bucket(gl->gl_lru_bucket);
> + remove_from_lru_locked(gl);
> + unlock_lru_bucket(gl->gl_lru_bucket);
> }
>
> -static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
> +void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
> {
> - spin_lock(&lru_lock);
> - if (!list_empty(&gl->gl_lru)) {
> - list_del_init(&gl->gl_lru);
> - atomic_dec(&lru_count);
> - clear_bit(GLF_LRU, &gl->gl_flags);
> - }
> - spin_unlock(&lru_lock);
> + glock_remove_from_lru(gl);
> + if (test_and_set_bit(GLF_LRU, &gl->gl_flags))
> + return;
> +
> + lock_lru_bucket(gl->gl_lru_bucket);
> + BUG_ON(!list_empty(&gl->gl_lru));
> + atomic_inc(&lru_count);
> + list_add_tail(&gl->gl_lru,
> + &lru_locks[gl->gl_lru_bucket & LRU_HASH_MASK].lru_list);
> + unlock_lru_bucket(gl->gl_lru_bucket);
> }
>
> /*
> @@ -182,7 +210,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
>
> lockref_mark_dead(&gl->gl_lockref);
>
> - gfs2_glock_remove_from_lru(gl);
> + glock_remove_from_lru(gl);
> spin_unlock(&gl->gl_lockref.lock);
> rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
> GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
> @@ -746,6 +774,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
> gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
> INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
> INIT_WORK(&gl->gl_delete, delete_work_func);
> + gl->gl_lru_bucket = atomic_inc_return(&lru_bucket);
> + if (gl->gl_lru_bucket >= LRU_HASH_BUCKETS) {
> + gl->gl_lru_bucket = 0;
> + atomic_set(&lru_bucket, gl->gl_lru_bucket);
> + }
>
> mapping = gfs2_glock2aspace(gl);
> if (mapping) {
> @@ -1010,8 +1043,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
> if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
> return -EIO;
>
> - if (test_bit(GLF_LRU, &gl->gl_flags))
> - gfs2_glock_remove_from_lru(gl);
> + glock_remove_from_lru(gl);
>
> spin_lock(&gl->gl_lockref.lock);
> add_to_queue(gh);
> @@ -1343,24 +1375,20 @@ static int glock_cmp(void *priv, struct list_head *a, struct list_head *b)
> }
>
> /**
> - * gfs2_dispose_glock_lru - Demote a list of glocks
> + * gfs2_dispose_glock_lru - Demote a list of glocks from the same LRU bucket
> * @list: The list to dispose of
> *
> * Disposing of glocks may involve disk accesses, so that here we sort
> * the glocks by number (i.e. disk location of the inodes) so that if
> * there are any such accesses, they'll be sent in order (mostly).
> - *
> - * Must be called under the lru_lock, but may drop and retake this
> - * lock. While the lru_lock is dropped, entries may vanish from the
> - * list, but no new entries will appear on the list (since it is
> - * private)
> */
>
> -static void gfs2_dispose_glock_lru(struct list_head *list)
> -__releases(&lru_lock)
> -__acquires(&lru_lock)
> +static long gfs2_dispose_glock_lru(struct list_head *list)
> {
> - struct gfs2_glock *gl;
> + struct gfs2_glock *gl = NULL;
> + LIST_HEAD(rejects);
> + int reject_count = 0;
> + long disposed = 0;
>
> list_sort(NULL, list, glock_cmp);
>
> @@ -1369,23 +1397,30 @@ __acquires(&lru_lock)
> list_del_init(&gl->gl_lru);
> if (!spin_trylock(&gl->gl_lockref.lock)) {
> add_back_to_lru:
> - list_add(&gl->gl_lru, &lru_list);
> - atomic_inc(&lru_count);
> + list_add_tail(&gl->gl_lru, &rejects);
> + reject_count++;
> continue;
> }
> if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
> spin_unlock(&gl->gl_lockref.lock);
> goto add_back_to_lru;
> }
> - clear_bit(GLF_LRU, &gl->gl_flags);
> + disposed++;
> gl->gl_lockref.count++;
> if (demote_ok(gl))
> handle_callback(gl, LM_ST_UNLOCKED, 0, false);
> WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
> __gfs2_glock_queue_work(gl, 0);
> spin_unlock(&gl->gl_lockref.lock);
> - cond_resched_lock(&lru_lock);
> }
> + if (!list_empty(&rejects)) {
> + gl = list_first_entry(&rejects, struct gfs2_glock, gl_lru);
> + lock_lru_bucket(gl->gl_lru_bucket);
> + list_splice(&rejects, &lru_locks[gl->gl_lru_bucket].lru_list);
> + atomic_add(reject_count, &lru_count);
> + unlock_lru_bucket(gl->gl_lru_bucket);
> + }
> + return disposed;
> }
>
> /**
> @@ -1399,30 +1434,33 @@ __acquires(&lru_lock)
>
> static long gfs2_scan_glock_lru(int nr)
> {
> - struct gfs2_glock *gl;
> - LIST_HEAD(skipped);
> + struct gfs2_glock *gl, *tmp;
> LIST_HEAD(dispose);
> + int start_bucket, bucket;
> long freed = 0;
> + static int last_lru_scan = 0;
>
> - spin_lock(&lru_lock);
> - while ((nr-- >= 0) && !list_empty(&lru_list)) {
> - gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
> -
> - /* Test for being demotable */
> - if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
> - list_move(&gl->gl_lru, &dispose);
> - atomic_dec(&lru_count);
> - freed++;
> - continue;
> + start_bucket = bucket = (last_lru_scan & LRU_HASH_MASK);
> + do {
> + lock_lru_bucket(bucket);
> + list_for_each_entry_safe(gl, tmp, &lru_locks[bucket].lru_list,
> + gl_lru) {
> + /* Test for being demotable */
> + if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
> + if (test_and_clear_bit(GLF_LRU, &gl->gl_flags))
> + remove_from_lru_locked(gl);
> + list_add_tail(&gl->gl_lru, &dispose);
> + nr--;
> + if (nr == 0)
> + break;
> + }
> }
> -
> - list_move(&gl->gl_lru, &skipped);
> - }
> - list_splice(&skipped, &lru_list);
> - if (!list_empty(&dispose))
> - gfs2_dispose_glock_lru(&dispose);
> - spin_unlock(&lru_lock);
> -
> + unlock_lru_bucket(bucket);
> + if (!list_empty(&dispose))
> + freed += gfs2_dispose_glock_lru(&dispose);
> + bucket = (bucket + 1) & LRU_HASH_MASK;
> + } while (nr && (bucket != start_bucket));
> + last_lru_scan = bucket;
> return freed;
> }
>
> @@ -1504,7 +1542,7 @@ static void thaw_glock(struct gfs2_glock *gl)
>
> static void clear_glock(struct gfs2_glock *gl)
> {
> - gfs2_glock_remove_from_lru(gl);
> + glock_remove_from_lru(gl);
>
> spin_lock(&gl->gl_lockref.lock);
> if (gl->gl_state != LM_ST_UNLOCKED)
> @@ -1704,7 +1742,7 @@ void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
> dtime *= 1000000/HZ; /* demote time in uSec */
> if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
> dtime = 0;
> - gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld\n",
> + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d m:%ld l:%d\n",
> state2str(gl->gl_state),
> gl->gl_name.ln_type,
> (unsigned long long)gl->gl_name.ln_number,
> @@ -1713,7 +1751,8 @@ void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
> state2str(gl->gl_demote_state), dtime,
> atomic_read(&gl->gl_ail_count),
> atomic_read(&gl->gl_revokes),
> - (int)gl->gl_lockref.count, gl->gl_hold_time);
> + (int)gl->gl_lockref.count, gl->gl_hold_time,
> + gl->gl_lru_bucket);
>
> list_for_each_entry(gh, &gl->gl_holders, gh_list)
> dump_holder(seq, gh);
> @@ -1797,6 +1836,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
> int __init gfs2_glock_init(void)
> {
> int ret;
> + unsigned i;
> +
> + for (i = 0; i < LRU_HASH_BUCKETS; i++) {
> + INIT_LIST_HEAD(&lru_locks[i].lru_list);
> + rwlock_init(&lru_locks[i].lru_lock);
> + }
>
> ret = rhashtable_init(&gl_hash_table, &ht_parms);
> if (ret < 0)
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 2ad003b..e8c0f2b 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -374,6 +374,7 @@ struct gfs2_glock {
> } gl_vm;
> };
> struct rhash_head gl_node;
> + int gl_lru_bucket;
> };
>
> #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
>