[PATCH 14/19] staging/lustre/obdclass: Eliminate hash bucket scans in lu_cache_shrink

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: green@linuxhacker.ru
To: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	devel@driverdev.osuosl.org,
	Andreas Dilger <andreas.dilger@intel.com>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	Ann Koehler <amk@cray.com>, Oleg Drokin <oleg.drokin@intel.com>
Subject: [PATCH 14/19] staging/lustre/obdclass: Eliminate hash bucket scans in lu_cache_shrink
Date: Mon, 14 Sep 2015 18:41:30 -0400	[thread overview]
Message-ID: <1442270495-1655259-15-git-send-email-green@linuxhacker.ru> (raw)
In-Reply-To: <1442270495-1655259-1-git-send-email-green@linuxhacker.ru>

From: Ann Koehler <amk@cray.com>

The lu_cache_shrink slab shrinker is too slow, accounting for > 90% of
the time spent in shrink_slab when allocating huge pages. Most of its
time is spent iterating over the buckets in each site's object hash
table to compute the number of freeable objects. This iteration is
eliminated by adding an lru length count to the lu_site struct. A
percpu counter is used to maintain the lru length, so that the
lu_site does not need to be locked when an object is accessed through
the hash table. A counter is updated whenever an object is added to
or deleted from any of the hash table buckets. The number of freeable
objects is the sum of the counter values across all cpus.

Signed-off-by: Ann Koehler <amk@cray.com>
Reviewed-on: http://review.whamcloud.com/14066
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-6365
Reviewed-by: Mike Pershin <mike.pershin@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Signed-off-by: Oleg Drokin <oleg.drokin@intel.com>
---
 drivers/staging/lustre/lustre/include/lu_object.h  |  1 +
 drivers/staging/lustre/lustre/obdclass/lu_object.c | 66 ++++++++++++++--------
 2 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
index ea13a82..96e271d 100644
--- a/drivers/staging/lustre/lustre/include/lu_object.h
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -584,6 +584,7 @@ enum {
 	LU_SS_CACHE_RACE,
 	LU_SS_CACHE_DEATH_RACE,
 	LU_SS_LRU_PURGED,
+	LU_SS_LRU_LEN,	/* # of objects in lsb_lru lists */
 	LU_SS_LAST_STAT
 };
 
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
index 4f7899f..c892e82 100644
--- a/drivers/staging/lustre/lustre/obdclass/lu_object.c
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -59,6 +59,7 @@
 #include <linux/list.h>
 
 static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx);
 
 /**
  * Decrease reference counter on object. If last reference is freed, return
@@ -126,6 +127,9 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o)
 		LASSERT(list_empty(&top->loh_lru));
 		list_add_tail(&top->loh_lru, &bkt->lsb_lru);
 		bkt->lsb_lru_len++;
+		lprocfs_counter_incr(site->ls_stats, LU_SS_LRU_LEN);
+		CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, lru_len: %ld\n",
+		       o, site->ls_obj_hash, bkt, bkt->lsb_lru_len);
 		cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
 		return;
 	}
@@ -174,16 +178,18 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
 	top = o->lo_header;
 	set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
 	if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
-		struct cfs_hash *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+		struct lu_site *site = o->lo_dev->ld_site;
+		struct cfs_hash *obj_hash = site->ls_obj_hash;
 		struct cfs_hash_bd bd;
 
 		cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
 		if (!list_empty(&top->loh_lru)) {
 			struct lu_site_bkt_data *bkt;
 
-		list_del_init(&top->loh_lru);
+			list_del_init(&top->loh_lru);
 			bkt = cfs_hash_bd_extra_get(obj_hash, &bd);
 			bkt->lsb_lru_len--;
+			lprocfs_counter_decr(site->ls_stats, LU_SS_LRU_LEN);
 		}
 		cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
 		cfs_hash_bd_unlock(obj_hash, &bd, 1);
@@ -355,6 +361,7 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
 					       &bd2, &h->loh_hash);
 			list_move(&h->loh_lru, &dispose);
 			bkt->lsb_lru_len--;
+			lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN);
 			if (did_sth == 0)
 				did_sth = 1;
 
@@ -568,8 +575,9 @@ static struct lu_object *htable_lookup(struct lu_site *s,
 		cfs_hash_get(s->ls_obj_hash, hnode);
 		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
 		if (!list_empty(&h->loh_lru)) {
-		list_del_init(&h->loh_lru);
+			list_del_init(&h->loh_lru);
 			bkt->lsb_lru_len--;
+			lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN);
 		}
 		return lu_object_top(h);
 	}
@@ -1029,6 +1037,12 @@ int lu_site_init(struct lu_site *s, struct lu_device *top)
 			     0, "cache_death_race", "cache_death_race");
 	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
 			     0, "lru_purged", "lru_purged");
+	/*
+	 * Unlike other counters, lru_len can be decremented so
+	 * need lc_sum instead of just lc_count
+	 */
+	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_LEN,
+			     LPROCFS_CNTR_AVGMINMAX, "lru_len", "lru_len");
 
 	INIT_LIST_HEAD(&s->ls_linkage);
 	s->ls_top_dev = top;
@@ -1817,27 +1831,21 @@ static void lu_site_stats_get(struct cfs_hash *hs,
 
 
 /*
- * There exists a potential lock inversion deadlock scenario when using
- * Lustre on top of ZFS. This occurs between one of ZFS's
- * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
- * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
- * while thread B will take the ht_lock and sleep on the lu_sites_guard
- * lock. Obviously neither thread will wake and drop their respective hold
- * on their lock.
+ * lu_cache_shrink_count returns the number of cached objects that are
+ * candidates to be freed by shrink_slab(). A counter, which tracks
+ * the number of items in the site's lru, is maintained in the per cpu
+ * stats of each site. The counter is incremented when an object is added
+ * to a site's lru and decremented when one is removed. The number of
+ * free-able objects is the sum of all per cpu counters for all sites.
  *
- * To prevent this from happening we must ensure the lu_sites_guard lock is
- * not taken while down this code path. ZFS reliably does not set the
- * __GFP_FS bit in its code paths, so this can be used to determine if it
- * is safe to take the lu_sites_guard lock.
- *
- * Ideally we should accurately return the remaining number of cached
- * objects without taking the  lu_sites_guard lock, but this is not
- * possible in the current implementation.
+ * Using a per cpu counter is a compromise solution to concurrent access:
+ * lu_object_put() can update the counter without locking the site and
+ * lu_cache_shrink_count can sum the counters without locking each
+ * ls_obj_hash bucket.
  */
 static unsigned long lu_cache_shrink_count(struct shrinker *sk,
 					   struct shrink_control *sc)
 {
-	struct lu_site_stats stats;
 	struct lu_site *s;
 	struct lu_site *tmp;
 	unsigned long cached = 0;
@@ -1847,14 +1855,14 @@ static unsigned long lu_cache_shrink_count(struct shrinker *sk,
 
 	mutex_lock(&lu_sites_guard);
 	list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
-		memset(&stats, 0, sizeof(stats));
-		lu_site_stats_get(s->ls_obj_hash, &stats, 0);
-		cached += stats.lss_total - stats.lss_busy;
+		cached += ls_stats_read(s->ls_stats, LU_SS_LRU_LEN);
 	}
 	mutex_unlock(&lu_sites_guard);
 
 	cached = (cached / 100) * sysctl_vfs_cache_pressure;
-	CDEBUG(D_INODE, "%ld objects cached\n", cached);
+	CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n",
+	       cached, sysctl_vfs_cache_pressure);
+
 	return cached;
 }
 
@@ -1988,6 +1996,13 @@ static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
 	struct lprocfs_counter ret;
 
 	lprocfs_stats_collect(stats, idx, &ret);
+	if (idx == LU_SS_LRU_LEN)
+		/*
+		 * protect against counter on cpu A being decremented
+		 * before counter is incremented on cpu B; unlikely
+		 */
+		return (__u32)((ret.lc_sum > 0) ? ret.lc_sum : 0);
+
 	return (__u32)ret.lc_count;
 }
 
@@ -2002,7 +2017,7 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
 	memset(&stats, 0, sizeof(stats));
 	lu_site_stats_get(s->ls_obj_hash, &stats, 1);
 
-	seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+	seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d %d\n",
 		   stats.lss_busy,
 		   stats.lss_total,
 		   stats.lss_populated,
@@ -2013,7 +2028,8 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
 		   ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
 		   ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
 		   ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
-		   ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+		   ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED),
+		   ls_stats_read(s->ls_stats, LU_SS_LRU_LEN));
 	return 0;
 }
 EXPORT_SYMBOL(lu_site_stats_print);
-- 
2.1.0

next prev parent reply	other threads:[~2015-09-14 22:42 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-09-14 22:41 [PATCH 00/19] Lustre fixes green
2015-09-14 22:41 ` [PATCH 01/19] staging/lustre/lnet: Reenable lnet router debugfs green
2015-09-14 22:41 ` [PATCH 02/19] staging/lustre/obdclass: reorganize busy object accounting green
2015-09-14 22:41 ` [PATCH 03/19] staging/lustre/llite: cleanup open handle for client open failure green
2015-09-14 22:41 ` [PATCH 04/19] staging/lustre/llite: strengthen checks for hsm flags and archive id green
2015-09-14 22:41 ` [PATCH 05/19] staging/lustre/ptlrpc: remove LUSTRE_MSG_MAGIC_V1 support green
2015-09-14 22:41 ` [PATCH 06/19] staging/lustre/lmv: fix potential null pointer dereference green
2015-09-15 13:26   ` Trevor Woerner
2015-09-15 13:57     ` Oleg Drokin
2015-09-14 22:41 ` [PATCH 07/19] staging/lustre/llite: deny non-root user for changelog operations green
2015-09-14 22:41 ` [PATCH 08/19] staging/lustre/o2iblnd: connection refcount fix for kiblnd_post_rx green
2015-09-14 22:41 ` [PATCH 09/19] staging/lustre/osc: LBUG in osc_lru_reclaim green
2015-09-14 22:41 ` [PATCH 10/19] staging/lustre/libcfs: minor fix in cfs_hash_for_each_relax() green
2015-09-14 22:41 ` [PATCH 11/19] staging/lustre/lnet: fix deadloop in ksocknal_push green
2015-09-14 22:41 ` [PATCH 12/19] staging/lustre/o2iblnd: wrong uses of kib_tx_t::tx_nfrags green
2015-09-14 22:41 ` [PATCH 13/19] staging/lustre/llite: ASSERTION( atomic_read(&d->ld_ref) == 0 ) failed green
2015-09-14 22:41 ` green [this message]
2015-09-14 22:41 ` [PATCH 15/19] staging/lustre: Remove unused MAY_ constants green
2015-09-14 22:41 ` [PATCH 16/19] staging/lustre/osc: use global osc_rq_pool to reduce memory usage green
2015-09-14 22:41 ` [PATCH 17/19] staging/lustre/o2iblnd: leak cmid in kiblnd_dev_need_failover green
2015-09-14 22:41 ` [PATCH 18/19] staging/lustre/libcfs: remove unused cfs_timer_done green
2015-09-14 22:41 ` [PATCH 19/19] staging/lustre/ptlrpc: make ptlrpcd threads cpt-aware green

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:ea13a82 dfblob:96e271d dfblob:4f7899f dfblob:c892e82 )
 OR (
bs:"[PATCH 14/19] staging/lustre/obdclass: Eliminate hash bucket scans in lu_cache_shrink" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1442270495-1655259-15-git-send-email-green@linuxhacker.ru \
    --to=green@linuxhacker.ru \
    --cc=amk@cray.com \
    --cc=andreas.dilger@intel.com \
    --cc=devel@driverdev.osuosl.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=oleg.drokin@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox