- * [PATCH v6 1/7] fs/dcache: Track & report number of negative dentries
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 19:32 ` [PATCH v6 2/7] fs/dcache: Add sysctl parameter neg-dentry-pc as a soft limit on " Waiman Long
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
The current dentry number tracking code doesn't distinguish between
positive & negative dentries. It just reports the total number of
dentries in the LRU lists.
As excessive number of negative dentries can have an impact on system
performance, it will be wise to track the number of positive and
negative dentries separately.
This patch adds tracking for the total number of negative dentries in
the system LRU lists and reports it in the /proc/sys/fs/dentry-state
file.  The number of positive dentries in the LRU lists can be found
by subtracting the number of negative dentries from the total.
Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/sysctl/fs.txt | 19 +++++++++++++------
 fs/dcache.c                 | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dcache.h      |  7 ++++---
 3 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 6c00c1e..a8e3f1f 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -61,19 +61,26 @@ struct {
         int nr_unused;
         int age_limit;         /* age in seconds */
         int want_pages;        /* pages requested by system */
-        int dummy[2];
+        int nr_negative;       /* # of unused negative dentries */
+        int dummy;
 } dentry_stat = {0, 0, 45, 0,};
--------------------------------------------------------------- 
+--------------------------------------------------------------
+
+Dentries are dynamically allocated and deallocated.
+
+nr_dentry shows the total number of dentries allocated (active
++ unused). nr_unused shows the number of dentries that are not
+actively used, but are saved in the LRU list for future reuse.
 
-Dentries are dynamically allocated and deallocated, and
-nr_dentry seems to be 0 all the time. Hence it's safe to
-assume that only nr_unused, age_limit and want_pages are
-used. Nr_unused seems to be exactly what its name says.
 Age_limit is the age in seconds after which dcache entries
 can be reclaimed when memory is short and want_pages is
 nonzero when shrink_dcache_pages() has been called and the
 dcache isn't pruned yet.
 
+nr_negative shows the number of unused dentries that are also
+negative dentries which do not mapped to actual files if negative
+dentries tracking is enabled.
+
 ==============================================================
 
 dquot-max & dquot-nr:
diff --git a/fs/dcache.c b/fs/dcache.c
index 0e8e5de..dbab6c2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -119,6 +119,7 @@ struct dentry_stat_t dentry_stat = {
 
 static DEFINE_PER_CPU(long, nr_dentry);
 static DEFINE_PER_CPU(long, nr_dentry_unused);
+static DEFINE_PER_CPU(long, nr_dentry_neg);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 
@@ -152,11 +153,22 @@ static long get_nr_dentry_unused(void)
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_neg(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_neg, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
 	dentry_stat.nr_unused = get_nr_dentry_unused();
+	dentry_stat.nr_negative = get_nr_dentry_neg();
 	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -214,6 +226,28 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
 
 #endif
 
+static inline void __neg_dentry_dec(struct dentry *dentry)
+{
+	this_cpu_dec(nr_dentry_neg);
+}
+
+static inline void neg_dentry_dec(struct dentry *dentry)
+{
+	if (unlikely(d_is_negative(dentry)))
+		__neg_dentry_dec(dentry);
+}
+
+static inline void __neg_dentry_inc(struct dentry *dentry)
+{
+	this_cpu_inc(nr_dentry_neg);
+}
+
+static inline void neg_dentry_inc(struct dentry *dentry)
+{
+	if (unlikely(d_is_negative(dentry)))
+		__neg_dentry_inc(dentry);
+}
+
 static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
 {
 	/*
@@ -330,6 +364,8 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
 	flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
 	WRITE_ONCE(dentry->d_flags, flags);
 	dentry->d_inode = NULL;
+	if (dentry->d_flags & DCACHE_LRU_LIST)
+		__neg_dentry_inc(dentry);
 }
 
 static void dentry_free(struct dentry *dentry)
@@ -397,6 +433,7 @@ static void d_lru_add(struct dentry *dentry)
 	dentry->d_flags |= DCACHE_LRU_LIST;
 	this_cpu_inc(nr_dentry_unused);
 	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	neg_dentry_inc(dentry);
 }
 
 static void d_lru_del(struct dentry *dentry)
@@ -405,6 +442,7 @@ static void d_lru_del(struct dentry *dentry)
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
 	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	neg_dentry_dec(dentry);
 }
 
 static void d_shrink_del(struct dentry *dentry)
@@ -435,6 +473,7 @@ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
 	list_lru_isolate(lru, &dentry->d_lru);
+	neg_dentry_dec(dentry);
 }
 
 static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
@@ -443,6 +482,7 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags |= DCACHE_SHRINK_LIST;
 	list_lru_isolate_move(lru, &dentry->d_lru, list);
+	neg_dentry_dec(dentry);
 }
 
 /**
@@ -1842,6 +1882,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	WARN_ON(d_in_lookup(dentry));
 
 	spin_lock(&dentry->d_lock);
+	/*
+	 * Decrement negative dentry count if it was in the LRU list.
+	 */
+	if (dentry->d_flags & DCACHE_LRU_LIST)
+		__neg_dentry_dec(dentry);
 	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	raw_write_seqcount_begin(&dentry->d_seq);
 	__d_set_inode_and_type(dentry, inode, add_flags);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 66c6e17..6e06d91 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -62,9 +62,10 @@ struct qstr {
 struct dentry_stat_t {
 	long nr_dentry;
 	long nr_unused;
-	long age_limit;          /* age in seconds */
-	long want_pages;         /* pages requested by system */
-	long dummy[2];
+	long age_limit;		/* age in seconds */
+	long want_pages;	/* pages requested by system */
+	long nr_negative;	/* # of unused negative dentries */
+	long dummy;
 };
 extern struct dentry_stat_t dentry_stat;
 
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * [PATCH v6 2/7] fs/dcache: Add sysctl parameter neg-dentry-pc as a soft limit on negative dentries
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
  2018-07-06 19:32 ` [PATCH v6 1/7] fs/dcache: Track & report number " Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 19:32 ` [PATCH v6 3/7] fs/dcache: Enable automatic pruning of " Waiman Long
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
A new sysctl parameter "neg-dentry-pc" is added to /proc/sys/fs whose
value represents a soft limit on the total number of negative dentries
allowable in a system as a percentage of the total system memory.
The allowable range of this new parameter is 0-10 where 0 means no
soft limit.
A warning message will be printed if the soft limit is exceeded.
Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/sysctl/fs.txt |   9 +++
 fs/dcache.c                 | 163 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dcache.h      |   5 ++
 kernel/sysctl.c             |  12 ++++
 4 files changed, 185 insertions(+), 4 deletions(-)
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index a8e3f1f..7980ecb 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/fs:
 - nr_open
 - overflowuid
 - overflowgid
+- neg-dentry-pc
 - pipe-user-pages-hard
 - pipe-user-pages-soft
 - protected_hardlinks
@@ -168,6 +169,14 @@ The default is 65534.
 
 ==============================================================
 
+neg-dentry-pc:
+
+This integer value specifies a soft limit to the total number of
+negative dentries allowed  in a system as a percentage of the total
+system memory available. The allowable range for this value is 0-10.
+
+==============================================================
+
 pipe-user-pages-hard:
 
 Maximum total number of pages a non-privileged user may allocate for pipes.
diff --git a/fs/dcache.c b/fs/dcache.c
index dbab6c2..175012b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -14,6 +14,8 @@
  * the dcache entry is deleted or garbage collected.
  */
 
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
 #include <linux/ratelimit.h>
 #include <linux/string.h>
 #include <linux/mm.h>
@@ -117,6 +119,38 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
+/*
+ * The sysctl parameter "neg-dentry-pc" specifies the limit for the number
+ * of negative dentries allowable in a system as a percentage of the total
+ * system memory. The default is 0% which means there is no limit and the
+ * valid range is 0-10.
+ *
+ * With a limit of 2% on a 64-bit system with 1G memory, that translated
+ * to about 100k dentries which is quite a lot.
+ *
+ * To avoid performance problem with a global counter on an SMP system,
+ * the tracking is done mostly on a per-cpu basis. The total limit is
+ * distributed in a 80/20 ratio to per-cpu counters and a global free pool.
+ *
+ * If a per-cpu counter runs out of negative dentries, it can borrow extra
+ * ones from the global free pool. If it has more than its percpu limit,
+ * the extra ones will be returned back to the global pool.
+ */
+#define NEG_DENTRY_BATCH	(1 << 8)
+
+static struct static_key limit_neg_key = STATIC_KEY_INIT_FALSE;
+static int neg_dentry_pc_old;
+int neg_dentry_pc;
+EXPORT_SYMBOL_GPL(neg_dentry_pc);
+
+static long neg_dentry_percpu_limit __read_mostly;
+static long neg_dentry_nfree_init __read_mostly; /* Free pool initial value */
+static struct {
+	raw_spinlock_t nfree_lock;
+	long nfree;			/* Negative dentry free pool */
+} ndblk ____cacheline_aligned_in_smp;
+proc_handler proc_neg_dentry_pc;
+
 static DEFINE_PER_CPU(long, nr_dentry);
 static DEFINE_PER_CPU(long, nr_dentry_unused);
 static DEFINE_PER_CPU(long, nr_dentry_neg);
@@ -160,6 +194,7 @@ static long get_nr_dentry_neg(void)
 
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry_neg, i);
+	sum += neg_dentry_nfree_init - ndblk.nfree;
 	return sum < 0 ? 0 : sum;
 }
 
@@ -226,9 +261,26 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
 
 #endif
 
-static inline void __neg_dentry_dec(struct dentry *dentry)
+/*
+ * Decrement negative dentry count if applicable.
+ */
+static void __neg_dentry_dec(struct dentry *dentry)
 {
-	this_cpu_dec(nr_dentry_neg);
+	if (!static_key_enabled(&limit_neg_key)) {
+		this_cpu_dec(nr_dentry_neg);
+		return;
+	}
+
+	if (unlikely(this_cpu_dec_return(nr_dentry_neg) < 0)) {
+		long *pcnt = get_cpu_ptr(&nr_dentry_neg);
+
+		if ((*pcnt < 0) && raw_spin_trylock(&ndblk.nfree_lock)) {
+			WRITE_ONCE(ndblk.nfree, ndblk.nfree + NEG_DENTRY_BATCH);
+			*pcnt += NEG_DENTRY_BATCH;
+			raw_spin_unlock(&ndblk.nfree_lock);
+		}
+		put_cpu_ptr(&nr_dentry_neg);
+	}
 }
 
 static inline void neg_dentry_dec(struct dentry *dentry)
@@ -237,9 +289,55 @@ static inline void neg_dentry_dec(struct dentry *dentry)
 		__neg_dentry_dec(dentry);
 }
 
-static inline void __neg_dentry_inc(struct dentry *dentry)
+/*
+ * Try to decrement the negative dentry free pool by NEG_DENTRY_BATCH.
+ * The actual decrement returned by the function may be smaller.
+ */
+static long __neg_dentry_nfree_dec(long cnt)
 {
-	this_cpu_inc(nr_dentry_neg);
+	cnt = max_t(long, NEG_DENTRY_BATCH, cnt);
+	raw_spin_lock(&ndblk.nfree_lock);
+	if (ndblk.nfree < cnt)
+		cnt = (ndblk.nfree > 0) ? ndblk.nfree : 0;
+	WRITE_ONCE(ndblk.nfree, ndblk.nfree - cnt);
+	raw_spin_unlock(&ndblk.nfree_lock);
+	return cnt;
+}
+
+/*
+ * Increment negative dentry count if applicable.
+ */
+static void __neg_dentry_inc(struct dentry *dentry)
+{
+	long cnt = 0, *pcnt;
+
+	if (!static_key_enabled(&limit_neg_key)) {
+		this_cpu_inc(nr_dentry_neg);
+		return;
+	}
+
+	if (likely(this_cpu_inc_return(nr_dentry_neg) <=
+		   neg_dentry_percpu_limit))
+		return;
+
+	/*
+	 * Try to move some negative dentry quota from the global free
+	 * pool to the percpu count to allow more negative dentries to
+	 * be added to the LRU.
+	 */
+	pcnt = get_cpu_ptr(&nr_dentry_neg);
+	if ((READ_ONCE(ndblk.nfree) > 0) &&
+	    (*pcnt > neg_dentry_percpu_limit)) {
+		cnt = __neg_dentry_nfree_dec(*pcnt - neg_dentry_percpu_limit);
+		*pcnt -= cnt;
+	}
+	put_cpu_ptr(&nr_dentry_neg);
+
+	/*
+	 * Put out a warning if there are too many negative dentries.
+	 */
+	if (!cnt)
+		pr_warn_once("Too many negative dentries.");
 }
 
 static inline void neg_dentry_inc(struct dentry *dentry)
@@ -248,6 +346,61 @@ static inline void neg_dentry_inc(struct dentry *dentry)
 		__neg_dentry_inc(dentry);
 }
 
+/*
+ * Sysctl proc handler for neg_dentry_pc.
+ */
+int proc_neg_dentry_pc(struct ctl_table *ctl, int write,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	/* Rough estimate of # of dentries allocated per page */
+	const unsigned int nr_dentry_page = PAGE_SIZE/sizeof(struct dentry) - 1;
+	unsigned long cnt, new_init;
+	int ret;
+
+	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+
+	if (!write || ret || (neg_dentry_pc == neg_dentry_pc_old))
+		return ret;
+
+	/*
+	 * Disable limit_neg_key first when transitioning from neg_dentry_pc
+	 * to !neg_dentry_pc. In this case, we freeze whatever value is in
+	 * neg_dentry_nfree_init and return.
+	 */
+	if (!neg_dentry_pc && neg_dentry_pc_old) {
+		static_key_slow_dec(&limit_neg_key);
+		goto out;
+	}
+
+	raw_spin_lock(&ndblk.nfree_lock);
+
+	/* 20% in global pool & 80% in percpu free */
+	new_init = totalram_pages * nr_dentry_page * neg_dentry_pc / 500;
+	cnt = new_init * 4 / num_possible_cpus();
+	if (unlikely((cnt < 2 * NEG_DENTRY_BATCH) && neg_dentry_pc))
+		cnt = 2 * NEG_DENTRY_BATCH;
+	neg_dentry_percpu_limit = cnt;
+
+	/*
+	 * Any change in neg_dentry_nfree_init must be applied to ndblk.nfree
+	 * as well. The ndblk.nfree value may become negative if there is
+	 * a decrease in percentage.
+	 */
+	ndblk.nfree += new_init - neg_dentry_nfree_init;
+	neg_dentry_nfree_init = new_init;
+	raw_spin_unlock(&ndblk.nfree_lock);
+
+	pr_info("Negative dentry: percpu limit = %ld, free pool = %ld\n",
+		neg_dentry_percpu_limit, neg_dentry_nfree_init);
+
+	if (!neg_dentry_pc_old)
+		static_key_slow_inc(&limit_neg_key);
+out:
+	neg_dentry_pc_old = neg_dentry_pc;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(proc_neg_dentry_pc);
+
 static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
 {
 	/*
@@ -3191,6 +3344,8 @@ static void __init dcache_init(void)
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 		d_iname);
 
+	raw_spin_lock_init(&ndblk.nfree_lock);
+
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
 		return;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6e06d91..44e19d9 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -610,4 +610,9 @@ struct name_snapshot {
 void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
 void release_dentry_name_snapshot(struct name_snapshot *);
 
+/*
+ * Negative dentry related declarations.
+ */
+extern int neg_dentry_pc;
+
 #endif	/* __LINUX_DCACHE_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d9837c..b46cb35 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -114,6 +114,8 @@
 extern int sysctl_nr_trim_pages;
 #endif
 
+extern proc_handler proc_neg_dentry_pc;
+
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
@@ -125,6 +127,7 @@
 static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
+static int __maybe_unused ten = 10;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 static int one_thousand = 1000;
@@ -1849,6 +1852,15 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "neg-dentry-pc",
+		.data		= &neg_dentry_pc,
+		.maxlen		= sizeof(neg_dentry_pc),
+		.mode		= 0644,
+		.proc_handler	= proc_neg_dentry_pc,
+		.extra1		= &zero,
+		.extra2		= &ten,
+	},
 	{ }
 };
 
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * [PATCH v6 3/7] fs/dcache: Enable automatic pruning of negative dentries
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
  2018-07-06 19:32 ` [PATCH v6 1/7] fs/dcache: Track & report number " Waiman Long
  2018-07-06 19:32 ` [PATCH v6 2/7] fs/dcache: Add sysctl parameter neg-dentry-pc as a soft limit on " Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 19:32 ` [PATCH v6 4/7] fs/dcache: Spread negative dentry pruning across multiple CPUs Waiman Long
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
It is not good enough to have a soft limit for the number of
negative dentries in the system and print a warning if that limit is
exceeded. We need to do something about it when this happens.
This patch enables automatic pruning of negative dentries when
neg-dentry-pc sysctl parameter is non-zero and the soft limit is going
to be exceeded.  This is done by using the workqueue API to do the
pruning gradually when a threshold is reached to minimize performance
impact on other running tasks.
The current threshold is 1/4 of the initial value of the free pool
count. Once the threshold is reached, the automatic pruning process
will be kicked in to replenish the free pool. Each pruning run will
scan 64 dentries per LRU list and can remove up to 256 negative
dentries to minimize the LRU locks hold time. The pruning rate will
be 50 Hz if the free pool count is less than 1/8 of the original and
10 Hz otherwise.
The dentry pruning operation may also free some least recently used
positive dentries.
In the unlikely event that a superblock is being umount'ed while in
negative dentry pruning mode, the umount may face an additional delay
of up to 0.1s.
This negative dentry shrinker is supposed to be run in the background
with minimal performance impact. So it does not remove excess negative
dentries as fast as the regular memory shrinker when the system is
under high memory pressure.  This negative dentry removal rate should
be enough under normal circumstances. In the extreme case that the
negative dentry generation rate is too high, both this shrinker and
the regular memory shrinker may be running at the same time when the
amount of free memory is too low.
Signed-off-by: Waiman Long <longman@redhat.com>
---
 fs/dcache.c              | 155 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/list_lru.h |   1 +
 mm/list_lru.c            |   4 +-
 3 files changed, 159 insertions(+), 1 deletion(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 175012b..ac25029 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -137,6 +137,11 @@ struct dentry_stat_t dentry_stat = {
  * the extra ones will be returned back to the global pool.
  */
 #define NEG_DENTRY_BATCH	(1 << 8)
+#define NEG_PRUNING_SIZE	(1 << 6)
+#define NEG_PRUNING_SLOW_RATE	(HZ/10)
+#define NEG_PRUNING_FAST_RATE	(HZ/50)
+#define NEG_IS_SB_UMOUNTING(sb)	\
+	unlikely(!(sb)->s_root || !((sb)->s_flags & MS_ACTIVE))
 
 static struct static_key limit_neg_key = STATIC_KEY_INIT_FALSE;
 static int neg_dentry_pc_old;
@@ -147,10 +152,18 @@ struct dentry_stat_t dentry_stat = {
 static long neg_dentry_nfree_init __read_mostly; /* Free pool initial value */
 static struct {
 	raw_spinlock_t nfree_lock;
+	int niter;			/* Pruning iteration count */
+	int lru_count;			/* Per-LRU pruning count */
+	long n_neg;			/* # of negative dentries pruned */
+	long n_pos;			/* # of positive dentries pruned */
 	long nfree;			/* Negative dentry free pool */
+	struct super_block *prune_sb;	/* Super_block for pruning */
 } ndblk ____cacheline_aligned_in_smp;
 proc_handler proc_neg_dentry_pc;
 
+static void prune_negative_dentry(struct work_struct *work);
+static DECLARE_DELAYED_WORK(prune_neg_dentry_work, prune_negative_dentry);
+
 static DEFINE_PER_CPU(long, nr_dentry);
 static DEFINE_PER_CPU(long, nr_dentry_unused);
 static DEFINE_PER_CPU(long, nr_dentry_neg);
@@ -338,6 +351,25 @@ static void __neg_dentry_inc(struct dentry *dentry)
 	 */
 	if (!cnt)
 		pr_warn_once("Too many negative dentries.");
+
+	/*
+	 * Initiate negative dentry pruning if free pool has less than
+	 * 1/4 of its initial value.
+	 */
+	if ((READ_ONCE(ndblk.nfree) < READ_ONCE(neg_dentry_nfree_init)/4) &&
+	    !READ_ONCE(ndblk.prune_sb) &&
+	    !cmpxchg(&ndblk.prune_sb, NULL, dentry->d_sb)) {
+		/*
+		 * Abort if umounting is in progress, otherwise take a
+		 * reference and move on.
+		 */
+		if (NEG_IS_SB_UMOUNTING(ndblk.prune_sb)) {
+			WRITE_ONCE(ndblk.prune_sb, NULL);
+		} else {
+			atomic_inc(&ndblk.prune_sb->s_active);
+			schedule_delayed_work(&prune_neg_dentry_work, 1);
+		}
+	}
 }
 
 static inline void neg_dentry_inc(struct dentry *dentry)
@@ -1411,6 +1443,129 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
+/*
+ * A modified version that attempts to remove a limited number of negative
+ * dentries as well as some other non-negative dentries at the front.
+ */
+static enum lru_status dentry_negative_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+	enum lru_status	status = LRU_SKIP;
+
+	/*
+	 * Limit amount of dentry walking in each LRU list.
+	 */
+	if (ndblk.lru_count >= NEG_PRUNING_SIZE) {
+		ndblk.lru_count = 0;
+		return LRU_STOP;
+	}
+	ndblk.lru_count++;
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(lru, dentry);
+		status = LRU_REMOVED;
+		goto out;
+	}
+
+	/*
+	 * Dentries with reference bit on are moved back to the tail.
+	 */
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		status = LRU_ROTATE;
+		goto out;
+	}
+
+	status = LRU_REMOVED;
+	d_lru_shrink_move(lru, dentry, freeable);
+	if (d_is_negative(dentry))
+		ndblk.n_neg++;
+out:
+	spin_unlock(&dentry->d_lock);
+	return status;
+}
+
+/*
+ * A workqueue function to prune negative dentry.
+ *
+ * The pruning is done gradually over time so as to have as little
+ * performance impact as possible.
+ */
+static void prune_negative_dentry(struct work_struct *work)
+{
+	int freed, last_n_neg;
+	long nfree;
+	struct super_block *sb = READ_ONCE(ndblk.prune_sb);
+	LIST_HEAD(dispose);
+
+	if (!sb)
+		return;
+	if (NEG_IS_SB_UMOUNTING(sb) || !READ_ONCE(neg_dentry_pc))
+		goto stop_pruning;
+
+	ndblk.niter++;
+	ndblk.lru_count = 0;
+	last_n_neg = ndblk.n_neg;
+	freed = list_lru_walk(&sb->s_dentry_lru, dentry_negative_lru_isolate,
+			      &dispose, NEG_DENTRY_BATCH);
+
+	if (freed)
+		shrink_dentry_list(&dispose);
+	ndblk.n_pos += freed - (ndblk.n_neg - last_n_neg);
+
+	/*
+	 * Continue delayed pruning until negative dentry free pool is at
+	 * least 1/2 of the initial value, the super_block has no more
+	 * negative dentries left at the front, or unmounting is in
+	 * progress.
+	 *
+	 * The pruning rate depends on the size of the free pool. The
+	 * faster rate is used when there is less than 1/8 left.
+	 * Otherwise, the slower rate will be used.
+	 */
+	nfree = READ_ONCE(ndblk.nfree);
+	if ((ndblk.n_neg == last_n_neg) ||
+	    (nfree >= neg_dentry_nfree_init/2) || NEG_IS_SB_UMOUNTING(sb))
+		goto stop_pruning;
+
+	schedule_delayed_work(&prune_neg_dentry_work,
+			     (nfree < neg_dentry_nfree_init/8)
+			     ? NEG_PRUNING_FAST_RATE : NEG_PRUNING_SLOW_RATE);
+	return;
+
+stop_pruning:
+#ifdef CONFIG_DEBUG_KERNEL
+	/*
+	 * Report large negative dentry pruning event.
+	 */
+	if (ndblk.n_neg > NEG_PRUNING_SIZE) {
+		pr_info("Negative dentry pruning (SB=%s):\n\t"
+			"%d iterations, %ld/%ld neg/pos dentries freed.\n",
+			ndblk.prune_sb->s_id, ndblk.niter, ndblk.n_neg,
+			ndblk.n_pos);
+	}
+#endif
+	ndblk.niter = 0;
+	ndblk.n_neg = ndblk.n_pos = 0;
+	deactivate_super(sb);
+	WRITE_ONCE(ndblk.prune_sb, NULL);
+}
+
 /**
  * enum d_walk_ret - action to talke during tree walk
  * @D_WALK_CONTINUE:	contrinue walk
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 96def9d..a9598a0 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -23,6 +23,7 @@ enum lru_status {
 	LRU_SKIP,		/* item cannot be locked, skip */
 	LRU_RETRY,		/* item not freeable. May drop the lock
 				   internally, but has to return locked. */
+	LRU_STOP,		/* stop walking the list */
 };
 
 struct list_lru_one {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fcfb6c8..2ee5d3a 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -246,11 +246,13 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
 			 */
 			assert_spin_locked(&nlru->lock);
 			goto restart;
+		case LRU_STOP:
+			goto out;
 		default:
 			BUG();
 		}
 	}
-
+out:
 	spin_unlock(&nlru->lock);
 	return isolated;
 }
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * [PATCH v6 4/7] fs/dcache: Spread negative dentry pruning across multiple CPUs
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
                   ` (2 preceding siblings ...)
  2018-07-06 19:32 ` [PATCH v6 3/7] fs/dcache: Enable automatic pruning of " Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 19:32 ` [PATCH v6 5/7] fs/dcache: Add negative dentries to LRU head initially Waiman Long
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
Doing negative dentry pruning using schedule_delayed_work() will
typically concentrate the pruning effort on one particular CPU. That is
not fair to the tasks running on that CPU. In addition, it is possible
that one CPU can have all its negative dentries pruned away while the
others can still have more negative dentries than the percpu limit.
To be fair, negative dentries pruning is now done across all the online
CPUs, if they all have close to the percpu limit of negative dentries.
Signed-off-by: Waiman Long <longman@redhat.com>
---
 fs/dcache.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index ac25029..3be9246 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -367,7 +367,8 @@ static void __neg_dentry_inc(struct dentry *dentry)
 			WRITE_ONCE(ndblk.prune_sb, NULL);
 		} else {
 			atomic_inc(&ndblk.prune_sb->s_active);
-			schedule_delayed_work(&prune_neg_dentry_work, 1);
+			schedule_delayed_work_on(smp_processor_id(),
+						&prune_neg_dentry_work, 1);
 		}
 	}
 }
@@ -1508,8 +1509,9 @@ static enum lru_status dentry_negative_lru_isolate(struct list_head *item,
  */
 static void prune_negative_dentry(struct work_struct *work)
 {
+	int cpu = smp_processor_id();
 	int freed, last_n_neg;
-	long nfree;
+	long nfree, excess;
 	struct super_block *sb = READ_ONCE(ndblk.prune_sb);
 	LIST_HEAD(dispose);
 
@@ -1543,9 +1545,40 @@ static void prune_negative_dentry(struct work_struct *work)
 	    (nfree >= neg_dentry_nfree_init/2) || NEG_IS_SB_UMOUNTING(sb))
 		goto stop_pruning;
 
-	schedule_delayed_work(&prune_neg_dentry_work,
-			     (nfree < neg_dentry_nfree_init/8)
-			     ? NEG_PRUNING_FAST_RATE : NEG_PRUNING_SLOW_RATE);
+	/*
+	 * If the negative dentry count in the current cpu is less than the
+	 * per_cpu limit, schedule the pruning in the next cpu if it has
+	 * more negative dentries. This will make the negative dentry count
+	 * reduction spread more evenly across multiple per-cpu counters.
+	 */
+	excess = neg_dentry_percpu_limit - __this_cpu_read(nr_dentry_neg);
+	if (excess > 0) {
+		int next_cpu = cpumask_next(cpu, cpu_online_mask);
+
+		if (next_cpu >= nr_cpu_ids)
+			next_cpu = cpumask_first(cpu_online_mask);
+		if (per_cpu(nr_dentry_neg, next_cpu) >
+		    __this_cpu_read(nr_dentry_neg)) {
+			cpu = next_cpu;
+
+			/*
+			 * Transfer some of the excess negative dentry count
+			 * to the free pool if the current percpu pool is less
+			 * than 3/4 of the limit.
+			 */
+			if ((excess > neg_dentry_percpu_limit/4) &&
+			    raw_spin_trylock(&ndblk.nfree_lock)) {
+				WRITE_ONCE(ndblk.nfree,
+					   ndblk.nfree + NEG_DENTRY_BATCH);
+				__this_cpu_add(nr_dentry_neg, NEG_DENTRY_BATCH);
+				raw_spin_unlock(&ndblk.nfree_lock);
+			}
+		}
+	}
+
+	schedule_delayed_work_on(cpu, &prune_neg_dentry_work,
+			(nfree < neg_dentry_nfree_init/8)
+			? NEG_PRUNING_FAST_RATE : NEG_PRUNING_SLOW_RATE);
 	return;
 
 stop_pruning:
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * [PATCH v6 5/7] fs/dcache: Add negative dentries to LRU head initially
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
                   ` (3 preceding siblings ...)
  2018-07-06 19:32 ` [PATCH v6 4/7] fs/dcache: Spread negative dentry pruning across multiple CPUs Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 19:32 ` [PATCH v6 6/7] fs/dcache: Allow optional enforcement of negative dentry limit Waiman Long
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
For negative dentries that are accessed once and never reused again,
there is not much value in putting the dentries at the tail of the LRU
list and keep it for a long time. So a new DCACHE_LRU_HEAD flag is added
to a negative dentry when it is initially created. When such a dentry
is added to the LRU, it will be added to the head so that it will be
the first to go when a shrinker is running. The flag is then cleared
after the LRU list addition. So if that dentry is accessed again,
it will be put back to the tail like the rest of the dentries.
By running a negative dentry generator for a certain period of time
and let the automatic pruning process to run through its course, the
number of negative and positive dentries discarded were:
	681 iterations, 43503/60 neg/pos dentries freed.
	45115 iterations, 2884992/64 neg/pos dentries freed.
So the number of positive dentries discarded is only 124.
Without this patch, the number of negative and positive dentries
discarded would be:
	20 iterations, 598/483 neg/pos dentries freed.
	60 iterations, 2977/517 neg/pos dentries freed.
	31 iterations, 1060/599 neg/pos dentries freed.
	11 iterations, 447/103 neg/pos dentries freed.
	17 iterations, 682/304 neg/pos dentries freed.
	17 iterations, 555/196 neg/pos dentries freed.
	33008 iterations, 2094860/7624 neg/pos dentries freed.
It can be seen that a lot more positive dentries would have been lost
as collateral damage in this case.
Suggested-by: Larry Woodman <lwoodman@redhat.com>
Signed-off-by: Waiman Long <longman@redhat.com>
---
 fs/dcache.c              | 18 +++++++++++++++++-
 include/linux/dcache.h   |  1 +
 include/linux/list_lru.h | 17 +++++++++++++++++
 mm/list_lru.c            | 19 +++++++++++++++++--
 4 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 3be9246..ec007ac 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -615,10 +615,23 @@ static void dentry_unlink_inode(struct dentry * dentry)
 #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
 static void d_lru_add(struct dentry *dentry)
 {
+	int ret;
+
 	D_FLAG_VERIFY(dentry, 0);
 	dentry->d_flags |= DCACHE_LRU_LIST;
 	this_cpu_inc(nr_dentry_unused);
-	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+	if (unlikely(dentry->d_flags & DCACHE_LRU_HEAD)) {
+		/*
+		 * Add to the head once, it will be added to the tail
+		 * next time.
+		 */
+		ret = list_lru_add_head(&dentry->d_sb->s_dentry_lru,
+					&dentry->d_lru);
+		dentry->d_flags &= ~DCACHE_LRU_HEAD;
+	} else {
+		ret = list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru);
+	}
+	WARN_ON_ONCE(!ret);
 	neg_dentry_inc(dentry);
 }
 
@@ -2988,6 +3001,9 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		__d_set_inode_and_type(dentry, inode, add_flags);
 		raw_write_seqcount_end(&dentry->d_seq);
 		fsnotify_update_flags(dentry);
+	} else {
+		/* It is a negative dentry, add it to LRU head initially. */
+		dentry->d_flags |= DCACHE_LRU_HEAD;
 	}
 	__d_rehash(dentry);
 	if (dir)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 44e19d9..317e040 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -215,6 +215,7 @@ struct dentry_operations {
 #define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
 #define DCACHE_ENCRYPTED_WITH_KEY	0x02000000 /* dir is encrypted with a valid key */
 #define DCACHE_OP_REAL			0x04000000
+#define DCACHE_LRU_HEAD 		0x08000000 /* Add to LRU head initially */
 
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
 #define DCACHE_DENTRY_CURSOR		0x20000000
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index a9598a0..7856435 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -87,6 +87,23 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
 bool list_lru_add(struct list_lru *lru, struct list_head *item);
 
 /**
+ * list_lru_add_head: add an element to the lru list's head
+ * @list_lru: the lru pointer
+ * @item: the item to be added.
+ *
+ * This is similar to list_lru_add(). The only difference is the location
+ * where the new item will be added. The list_lru_add() function will add
+ * the new item to the tail as it is the most recently used one. The
+ * list_lru_add_head() will add the new item into the head so that it
+ * will the first to go if a shrinker is running. So this function should
+ * only be used for less important item that can be the first to go if
+ * the system is under memory pressure.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add_head(struct list_lru *lru, struct list_head *item);
+
+/**
  * list_lru_del: delete an element to the lru list
  * @list_lru: the lru pointer
  * @item: the item to be deleted.
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 2ee5d3a..4ea3c1e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -107,7 +107,8 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
 
-bool list_lru_add(struct list_lru *lru, struct list_head *item)
+static inline bool __list_lru_add(struct list_lru *lru, struct list_head *item,
+				  const bool add_tail)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
@@ -116,7 +117,10 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
 	spin_lock(&nlru->lock);
 	if (list_empty(item)) {
 		l = list_lru_from_kmem(nlru, item);
-		list_add_tail(item, &l->list);
+		if (add_tail)
+			list_add_tail(item, &l->list);
+		else
+			list_add(item, &l->list);
 		l->nr_items++;
 		nlru->nr_items++;
 		spin_unlock(&nlru->lock);
@@ -125,8 +129,19 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
 	spin_unlock(&nlru->lock);
 	return false;
 }
+
+bool list_lru_add(struct list_lru *lru, struct list_head *item)
+{
+	return __list_lru_add(lru, item, true);
+}
 EXPORT_SYMBOL_GPL(list_lru_add);
 
+bool list_lru_add_head(struct list_lru *lru, struct list_head *item)
+{
+	return __list_lru_add(lru, item, false);
+}
+EXPORT_SYMBOL_GPL(list_lru_add_head);
+
 bool list_lru_del(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * [PATCH v6 6/7] fs/dcache: Allow optional enforcement of negative dentry limit
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
                   ` (4 preceding siblings ...)
  2018-07-06 19:32 ` [PATCH v6 5/7] fs/dcache: Add negative dentries to LRU head initially Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 19:32 ` [PATCH v6 7/7] fs/dcache: Allow deconfiguration of negative dentry code to reduce kernel size Waiman Long
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
If a rogue application that generates a large number of negative
dentries is running, the automatic negative dentries pruning process
may not be fast enough to clear up the negative dentries in time. In
this case, it is possible that negative dentries will use up most
of the available memory in the system when that application is not
under the control of a memory cgroup that limit kernel memory.
The lack of available memory may significantly impact the operation
of other applications running in the system. It will slow down system
performance and may even work as part of a DoS attack on the system.
To allow system administrators the option to prevent this extreme
situation from happening, a new "neg-dentry-enforce" sysctl parameter
is now added which can be set to to enforce the negative dentry soft
limit set in "neg-dentry-pc" so that it becomes a hard limit. When the
limit is enforced, extra negative dentries that exceed the limit will
be killed after use instead of leaving them in the LRU.
Signed-off-by: Waiman Long <longman@redhat.com>
---
 Documentation/sysctl/fs.txt | 10 ++++++++++
 fs/dcache.c                 | 42 +++++++++++++++++++++++++++++++++++-------
 include/linux/dcache.h      |  2 ++
 kernel/sysctl.c             |  9 +++++++++
 4 files changed, 56 insertions(+), 7 deletions(-)
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 7980ecb..3a3c8fa 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/fs:
 - nr_open
 - overflowuid
 - overflowgid
+- neg-dentry-enforce
 - neg-dentry-pc
 - pipe-user-pages-hard
 - pipe-user-pages-soft
@@ -169,6 +170,15 @@ The default is 65534.
 
 ==============================================================
 
+neg-dentry-enforce:
+
+The file neg-dentry-enforce, if present, contains a boolean flag (0 or
+1) indicating if the negative dentries limit set by the "neg_dentry_pc"
+sysctl parameter should be enforced or not.  If enforced, excess negative
+dentries over the limit will be killed immediately after use.
+
+==============================================================
+
 neg-dentry-pc:
 
 This integer value specifies a soft limit to the total number of
diff --git a/fs/dcache.c b/fs/dcache.c
index ec007ac..43d49d7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -147,6 +147,8 @@ struct dentry_stat_t dentry_stat = {
 static int neg_dentry_pc_old;
 int neg_dentry_pc;
 EXPORT_SYMBOL_GPL(neg_dentry_pc);
+int neg_dentry_enforce;	/* Enforce the negative dentry limit */
+EXPORT_SYMBOL_GPL(neg_dentry_enforce);
 
 static long neg_dentry_percpu_limit __read_mostly;
 static long neg_dentry_nfree_init __read_mostly; /* Free pool initial value */
@@ -161,6 +163,7 @@ struct dentry_stat_t dentry_stat = {
 } ndblk ____cacheline_aligned_in_smp;
 proc_handler proc_neg_dentry_pc;
 
+static void d_lru_del(struct dentry *dentry);
 static void prune_negative_dentry(struct work_struct *work);
 static DECLARE_DELAYED_WORK(prune_neg_dentry_work, prune_negative_dentry);
 
@@ -319,8 +322,12 @@ static long __neg_dentry_nfree_dec(long cnt)
 
 /*
  * Increment negative dentry count if applicable.
+ *
+ * The retain flag will only be set when calling from
+ * __d_clear_type_and_inode() so as to retain the entry even
+ * if the negative dentry limit has been exceeded.
  */
-static void __neg_dentry_inc(struct dentry *dentry)
+static void __neg_dentry_inc(struct dentry *dentry, bool retain)
 {
 	long cnt = 0, *pcnt;
 
@@ -347,10 +354,25 @@ static void __neg_dentry_inc(struct dentry *dentry)
 	put_cpu_ptr(&nr_dentry_neg);
 
 	/*
-	 * Put out a warning if there are too many negative dentries.
+	 * Put out a warning if there are too many negative dentries or
+	 * kill it by removing it from the LRU and set the
+	 * DCACHE_KILL_NEGATIVE flag if the enforce option is on.
 	 */
-	if (!cnt)
+	if (!cnt) {
+		if (neg_dentry_enforce && !retain) {
+			dentry->d_flags |= DCACHE_KILL_NEGATIVE;
+			d_lru_del(dentry);
+			/*
+			 * When the dentry is no longer in LRU, we
+			 * need to keep the reference count to 1 to
+			 * avoid problem when killing it.
+			 */
+			WARN_ON_ONCE(dentry->d_lockref.count);
+			dentry->d_lockref.count = 1;
+			return; /* Kill the dentry now */
+		}
 		pr_warn_once("Too many negative dentries.");
+	}
 
 	/*
 	 * Initiate negative dentry pruning if free pool has less than
@@ -376,7 +398,7 @@ static void __neg_dentry_inc(struct dentry *dentry)
 static inline void neg_dentry_inc(struct dentry *dentry)
 {
 	if (unlikely(d_is_negative(dentry)))
-		__neg_dentry_inc(dentry);
+		__neg_dentry_inc(dentry, false);
 }
 
 /*
@@ -551,7 +573,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
 	WRITE_ONCE(dentry->d_flags, flags);
 	dentry->d_inode = NULL;
 	if (dentry->d_flags & DCACHE_LRU_LIST)
-		__neg_dentry_inc(dentry);
+		__neg_dentry_inc(dentry, true);	/* Always retain it */
 }
 
 static void dentry_free(struct dentry *dentry)
@@ -871,16 +893,22 @@ static inline bool retain_dentry(struct dentry *dentry)
 	if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
 		return false;
 
+	if (unlikely(dentry->d_flags & DCACHE_KILL_NEGATIVE))
+		return false;
+
 	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
 		if (dentry->d_op->d_delete(dentry))
 			return false;
 	}
 	/* retain; LRU fodder */
 	dentry->d_lockref.count--;
-	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
 		d_lru_add(dentry);
-	else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+		if (unlikely(dentry->d_flags & DCACHE_KILL_NEGATIVE))
+			return false;
+	} else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) {
 		dentry->d_flags |= DCACHE_REFERENCED;
+	}
 	return true;
 }
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 317e040..71a3315 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -219,6 +219,7 @@ struct dentry_operations {
 
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
 #define DCACHE_DENTRY_CURSOR		0x20000000
+#define DCACHE_KILL_NEGATIVE		0x40000000 /* Kill negative dentry */
 
 extern seqlock_t rename_lock;
 
@@ -615,5 +616,6 @@ struct name_snapshot {
  * Negative dentry related declarations.
  */
 extern int neg_dentry_pc;
+extern int neg_dentry_enforce;
 
 #endif	/* __LINUX_DCACHE_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b46cb35..8c008ae 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1861,6 +1861,15 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.extra1		= &zero,
 		.extra2		= &ten,
 	},
+	{
+		.procname	= "neg-dentry-enforce",
+		.data		= &neg_dentry_enforce,
+		.maxlen		= sizeof(neg_dentry_enforce),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };
 
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * [PATCH v6 7/7] fs/dcache: Allow deconfiguration of negative dentry code to reduce kernel size
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
                   ` (5 preceding siblings ...)
  2018-07-06 19:32 ` [PATCH v6 6/7] fs/dcache: Allow optional enforcement of negative dentry limit Waiman Long
@ 2018-07-06 19:32 ` Waiman Long
  2018-07-06 21:54   ` Eric Biggers
  2018-07-06 22:28 ` [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Al Viro
  2018-07-09  8:19 ` Michal Hocko
  8 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-06 19:32 UTC (permalink / raw)
  To: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook
  Cc: linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C), Waiman Long
The tracking and limit of negative dentries in a filesystem is a useful
addition. However, for users who want to reduce the kernel size as much
as possible, this feature will probably be on the chopping block. To
suit those users, a default-y config option DCACHE_LIMIT_NEG_ENTRY is
added so that the negative dentry tracking and limiting code can be
configured out, if necessary.
Signed-off-by: Waiman Long <longman@redhat.com>
---
 fs/Kconfig             | 10 ++++++++++
 fs/dcache.c            | 33 ++++++++++++++++++++++++++++++++-
 include/linux/dcache.h |  2 ++
 kernel/sysctl.c        |  2 ++
 4 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index ac474a6..b521941 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -113,6 +113,16 @@ source "fs/autofs/Kconfig"
 source "fs/fuse/Kconfig"
 source "fs/overlayfs/Kconfig"
 
+#
+# Track and limit the number of negative dentries allowed in the system.
+#
+config DCACHE_LIMIT_NEG_ENTRY
+	bool "Track & limit negative dcache entries"
+	default y
+	help
+	  This option enables the tracking and limiting of the total
+	  number of negative dcache entries allowable in the filesystem.
+
 menu "Caches"
 
 source "fs/fscache/Kconfig"
diff --git a/fs/dcache.c b/fs/dcache.c
index 43d49d7..d00761e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -143,6 +143,7 @@ struct dentry_stat_t dentry_stat = {
 #define NEG_IS_SB_UMOUNTING(sb)	\
 	unlikely(!(sb)->s_root || !((sb)->s_flags & MS_ACTIVE))
 
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 static struct static_key limit_neg_key = STATIC_KEY_INIT_FALSE;
 static int neg_dentry_pc_old;
 int neg_dentry_pc;
@@ -166,10 +167,11 @@ struct dentry_stat_t dentry_stat = {
 static void d_lru_del(struct dentry *dentry);
 static void prune_negative_dentry(struct work_struct *work);
 static DECLARE_DELAYED_WORK(prune_neg_dentry_work, prune_negative_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_neg);
+#endif /* CONFIG_DCACHE_LIMIT_NEG_ENTRY */
 
 static DEFINE_PER_CPU(long, nr_dentry);
 static DEFINE_PER_CPU(long, nr_dentry_unused);
-static DEFINE_PER_CPU(long, nr_dentry_neg);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 
@@ -203,6 +205,7 @@ static long get_nr_dentry_unused(void)
 	return sum < 0 ? 0 : sum;
 }
 
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 static long get_nr_dentry_neg(void)
 {
 	int i;
@@ -213,6 +216,9 @@ static long get_nr_dentry_neg(void)
 	sum += neg_dentry_nfree_init - ndblk.nfree;
 	return sum < 0 ? 0 : sum;
 }
+#else
+static long get_nr_dentry_neg(void)	{ return 0; }
+#endif
 
 int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
@@ -277,6 +283,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
 
 #endif
 
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 /*
  * Decrement negative dentry count if applicable.
  */
@@ -455,6 +462,26 @@ int proc_neg_dentry_pc(struct ctl_table *ctl, int write,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(proc_neg_dentry_pc);
+#else /* CONFIG_DCACHE_LIMIT_NEG_ENTRY */
+
+static inline void __neg_dentry_dec(struct dentry *dentry)
+{
+}
+
+static inline void neg_dentry_dec(struct dentry *dentry)
+{
+}
+
+static inline void __neg_dentry_inc(struct dentry *dentry, bool retain)
+{
+}
+
+static inline void neg_dentry_inc(struct dentry *dentry)
+{
+}
+
+#endif /* CONFIG_DCACHE_LIMIT_NEG_ENTRY */
+
 
 static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
 {
@@ -1485,6 +1512,7 @@ void shrink_dcache_sb(struct super_block *sb)
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 /*
  * A modified version that attempts to remove a limited number of negative
  * dentries as well as some other non-negative dentries at the front.
@@ -1639,6 +1667,7 @@ static void prune_negative_dentry(struct work_struct *work)
 	deactivate_super(sb);
 	WRITE_ONCE(ndblk.prune_sb, NULL);
 }
+#endif /* CONFIG_DCACHE_LIMIT_NEG_ENTRY */
 
 /**
  * enum d_walk_ret - action to talke during tree walk
@@ -3576,7 +3605,9 @@ static void __init dcache_init(void)
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 		d_iname);
 
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 	raw_spin_lock_init(&ndblk.nfree_lock);
+#endif
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 71a3315..27ffc35 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -612,10 +612,12 @@ struct name_snapshot {
 void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
 void release_dentry_name_snapshot(struct name_snapshot *);
 
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 /*
  * Negative dentry related declarations.
  */
 extern int neg_dentry_pc;
 extern int neg_dentry_enforce;
+#endif
 
 #endif	/* __LINUX_DCACHE_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c008ae..875d5ef 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1852,6 +1852,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+#ifdef CONFIG_DCACHE_LIMIT_NEG_ENTRY
 	{
 		.procname	= "neg-dentry-pc",
 		.data		= &neg_dentry_pc,
@@ -1870,6 +1871,7 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#endif
 	{ }
 };
 
-- 
1.8.3.1
^ permalink raw reply related	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 7/7] fs/dcache: Allow deconfiguration of negative dentry code to reduce kernel size
  2018-07-06 19:32 ` [PATCH v6 7/7] fs/dcache: Allow deconfiguration of negative dentry code to reduce kernel size Waiman Long
@ 2018-07-06 21:54   ` Eric Biggers
  0 siblings, 0 replies; 49+ messages in thread
From: Eric Biggers @ 2018-07-06 21:54 UTC (permalink / raw)
  To: Waiman Long
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On Fri, Jul 06, 2018 at 03:32:52PM -0400, Waiman Long wrote:
> The tracking and limit of negative dentries in a filesystem is a useful
> addition. However, for users who want to reduce the kernel size as much
> as possible, this feature will probably be on the chopping block. To
> suit those users, a default-y config option DCACHE_LIMIT_NEG_ENTRY is
> added so that the negative dentry tracking and limiting code can be
> configured out, if necessary.
> 
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>  fs/Kconfig             | 10 ++++++++++
>  fs/dcache.c            | 33 ++++++++++++++++++++++++++++++++-
>  include/linux/dcache.h |  2 ++
>  kernel/sysctl.c        |  2 ++
>  4 files changed, 46 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/Kconfig b/fs/Kconfig
> index ac474a6..b521941 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -113,6 +113,16 @@ source "fs/autofs/Kconfig"
>  source "fs/fuse/Kconfig"
>  source "fs/overlayfs/Kconfig"
>  
> +#
> +# Track and limit the number of negative dentries allowed in the system.
> +#
> +config DCACHE_LIMIT_NEG_ENTRY
> +	bool "Track & limit negative dcache entries"
> +	default y
> +	help
> +	  This option enables the tracking and limiting of the total
> +	  number of negative dcache entries allowable in the filesystem.
> +
If there's going to be a config option for this, it should be documented
properly.  I.e., why would someone want to turn this on, or turn it off?  What
are the tradeoffs?  If unsure, should the user say y or n?
I think there are way too many config options that were meaningful to the person
writing the code but aren't meaningful to people configuring the kernel.
- Eric
^ permalink raw reply	[flat|nested] 49+ messages in thread 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
                   ` (6 preceding siblings ...)
  2018-07-06 19:32 ` [PATCH v6 7/7] fs/dcache: Allow deconfiguration of negative dentry code to reduce kernel size Waiman Long
@ 2018-07-06 22:28 ` Al Viro
  2018-07-07  3:02   ` Waiman Long
  2018-07-09  8:19 ` Michal Hocko
  8 siblings, 1 reply; 49+ messages in thread
From: Al Viro @ 2018-07-06 22:28 UTC (permalink / raw)
  To: Waiman Long
  Cc: Jonathan Corbet, Luis R. Rodriguez, Kees Cook, linux-kernel,
	linux-fsdevel, linux-mm, linux-doc, Linus Torvalds, Jan Kara,
	Paul E. McKenney, Andrew Morton, Ingo Molnar, Miklos Szeredi,
	Matthew Wilcox, Larry Woodman, James Bottomley, Wangkai (Kevin C)
On Fri, Jul 06, 2018 at 03:32:45PM -0400, Waiman Long wrote:
> With a 4.18 based kernel, the positive & negative dentries lookup rates
> (lookups per second) after initial boot on a 2-socket 24-core 48-thread
> 64GB memory system with and without the patch were as follows: `
> 
>   Metric                    w/o patch  neg_dentry_pc=0  neg_dentry_pc=1
>   ------                    ---------  ---------------  ---------------
>   Positive dentry lookup      584299       586749	   582670
>   Negative dentry lookup     1422204      1439994	  1438440
>   Negative dentry creation    643535       652194	   641841
> 
> For the lookup rate, there isn't any signifcant difference with or
> without the patch or with a zero or non-zero value of neg_dentry_pc.
Sigh...  What I *still* don't see (after all the iterations of the patchset)
is any performance data on workloads that would be likely to feel the impact.
Anything that seriously hits INCLUDE_PATH, for starters...
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-06 22:28 ` [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Al Viro
@ 2018-07-07  3:02   ` Waiman Long
  0 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-07  3:02 UTC (permalink / raw)
  To: Al Viro
  Cc: Jonathan Corbet, Luis R. Rodriguez, Kees Cook, linux-kernel,
	linux-fsdevel, linux-mm, linux-doc, Linus Torvalds, Jan Kara,
	Paul E. McKenney, Andrew Morton, Ingo Molnar, Miklos Szeredi,
	Matthew Wilcox, Larry Woodman, James Bottomley, Wangkai (Kevin C)
On 07/06/2018 06:28 PM, Al Viro wrote:
> On Fri, Jul 06, 2018 at 03:32:45PM -0400, Waiman Long wrote:
>
>> With a 4.18 based kernel, the positive & negative dentries lookup rates
>> (lookups per second) after initial boot on a 2-socket 24-core 48-thread
>> 64GB memory system with and without the patch were as follows: `
>>
>>   Metric                    w/o patch  neg_dentry_pc=0  neg_dentry_pc=1
>>   ------                    ---------  ---------------  ---------------
>>   Positive dentry lookup      584299       586749	   582670
>>   Negative dentry lookup     1422204      1439994	  1438440
>>   Negative dentry creation    643535       652194	   641841
>>
>> For the lookup rate, there isn't any signifcant difference with or
>> without the patch or with a zero or non-zero value of neg_dentry_pc.
> Sigh...  What I *still* don't see (after all the iterations of the patchset)
> is any performance data on workloads that would be likely to feel the impact.
> Anything that seriously hits INCLUDE_PATH, for starters...
I wrote a simple microbenchmark that does a lot of open() system calls
to create positive or negative dentries. I was not seeing any noticeable
performance difference as long as not too many negative dentries were
created.
Please enlighten me on how kind of performance benchmark that you would
like me to run.
Thanks,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-06 19:32 [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Waiman Long
                   ` (7 preceding siblings ...)
  2018-07-06 22:28 ` [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries Al Viro
@ 2018-07-09  8:19 ` Michal Hocko
  2018-07-09 16:01   ` Waiman Long
  8 siblings, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-09  8:19 UTC (permalink / raw)
  To: Waiman Long
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On Fri 06-07-18 15:32:45, Waiman Long wrote:
[...]
> A rogue application can potentially create a large number of negative
> dentries in the system consuming most of the memory available if it
> is not under the direct control of a memory controller that enforce
> kernel memory limit.
How does this differ from other untracked allocations for untrusted
tasks in general? E.g. nothing really prevents a task to create a long
chain of unreclaimable dentries and even go to OOM potentially. Negative
dentries should be easily reclaimable on the other hand. So why does the
later needs a special treatment while the first one is ok? There are
quite some resources which allow a non privileged user to consume a lot
of memory and the memory controller is the only reliable way to mitigate
the risk.
> This patchset introduces changes to the dcache subsystem to track and
> optionally limit the number of negative dentries allowed to be created by
> background pruning of excess negative dentries or even kill it after use.
> This capability will help to limit the amount of memory that can be
> consumed by negative dentries.
How are you going to balance that between workload? What prevents a
rogue application to simply consume the limit and force all others in
the system to go slow path?
> Patch 1 tracks the number of negative dentries present in the LRU
> lists and reports it in /proc/sys/fs/dentry-state.
If anything I _think_ vmstat would benefit from this because behavior of
the memory reclaim does depend on the amount of neg. dentries.
> Patch 2 adds a "neg-dentry-pc" sysctl parameter that can be used to to
> specify a soft limit on the number of negative allowed as a percentage
> of total system memory. This parameter is 0 by default which means no
> negative dentry limiting will be performed.
percentage has turned out to be a really wrong unit for many tunables
over time. Even 1% can be just too much on really large machines.
> Patch 3 enables automatic pruning of least recently used negative
> dentries when the total number is close to the preset limit.
Please explain why this cannot be done in a standard dcache shrinking
way. I strongly suspect that you are developing yet another reclaim with
its own sets of tunable and bypassing the existing infrastructure. I
haven't read patches yet but the cover letter doesn't really explain
design much so I am only guessing.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-09  8:19 ` Michal Hocko
@ 2018-07-09 16:01   ` Waiman Long
  2018-07-10 14:27     ` Michal Hocko
  0 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-09 16:01 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On 07/09/2018 04:19 AM, Michal Hocko wrote:
> On Fri 06-07-18 15:32:45, Waiman Long wrote:
> [...]
>> A rogue application can potentially create a large number of negative
>> dentries in the system consuming most of the memory available if it
>> is not under the direct control of a memory controller that enforce
>> kernel memory limit.
> How does this differ from other untracked allocations for untrusted
> tasks in general? E.g. nothing really prevents a task to create a long
> chain of unreclaimable dentries and even go to OOM potentially. Negative
> dentries should be easily reclaimable on the other hand. So why does the
I think all dentries are reclaimable. Yes, a rogue application or user
can create million of new files and hence dentries or consuming buffer
cache. The major difference here is the other attack can be very
noticeable and traceable. Filesystem limits may also be hit first before
the system is running out of memory. The negative dentry attack,
however, is more hidden and not easily traceable. So you won't know the
system is in trouble until it is almost running out of free memory.
> later needs a special treatment while the first one is ok? There are
> quite some resources which allow a non privileged user to consume a lot
> of memory and the memory controller is the only reliable way to mitigate
> the risk.
Yes, memory controller is the only reliable way to mitigate the risk,
but not all tasks are under the control of a memory controller with
kernel memory limit.
>> This patchset introduces changes to the dcache subsystem to track and
>> optionally limit the number of negative dentries allowed to be created by
>> background pruning of excess negative dentries or even kill it after use.
>> This capability will help to limit the amount of memory that can be
>> consumed by negative dentries.
> How are you going to balance that between workload? What prevents a
> rogue application to simply consume the limit and force all others in
> the system to go slow path?
With the current patchset, it is possible for a rogue application to
force every one else to go to slow path. One possible solution to this
is to go to the slowpath only for the newly created neg dentries, not
for those that have been created previously and reused again. Patch 5 of
the current series track which negative dentry is newly created and
handle it differently. I can move this up the series and use that
information to decide if we should go to the slowpath.
>> Patch 1 tracks the number of negative dentries present in the LRU
>> lists and reports it in /proc/sys/fs/dentry-state.
> If anything I _think_ vmstat would benefit from this because behavior of
> the memory reclaim does depend on the amount of neg. dentries.
>
>> Patch 2 adds a "neg-dentry-pc" sysctl parameter that can be used to to
>> specify a soft limit on the number of negative allowed as a percentage
>> of total system memory. This parameter is 0 by default which means no
>> negative dentry limiting will be performed.
> percentage has turned out to be a really wrong unit for many tunables
> over time. Even 1% can be just too much on really large machines.
Yes, that is true. Do you have any suggestion of what kind of unit
should be used? I can scale down the unit to 0.1% of the system memory.
Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
corresponds to 200k, etc.
>
>> Patch 3 enables automatic pruning of least recently used negative
>> dentries when the total number is close to the preset limit.
> Please explain why this cannot be done in a standard dcache shrinking
> way. I strongly suspect that you are developing yet another reclaim with
> its own sets of tunable and bypassing the existing infrastructure. I
> haven't read patches yet but the cover letter doesn't really explain
> design much so I am only guessing.
The standard dcache shrinking happens when the system is almost running
out of free memory. This new shrinker will be turned on when the number
of negative dentries is closed to the limit even when there are still
plenty of free memory left. It will stop when the number of negative
dentries is lowered to a safe level. The new shrinker is designed to
impose as little overhead to the currently running tasks. That is not
true for the standard shrinker which will have a rather significant
performance impact to the currently running tasks.
I can remove the new shrinker if people really don't want to add a new
one as long as I can keep the option to kill off newly created negative
dentries when the limit is exceeded.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-09 16:01   ` Waiman Long
@ 2018-07-10 14:27     ` Michal Hocko
  2018-07-10 16:09       ` Waiman Long
  0 siblings, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-10 14:27 UTC (permalink / raw)
  To: Waiman Long
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On Mon 09-07-18 12:01:04, Waiman Long wrote:
> On 07/09/2018 04:19 AM, Michal Hocko wrote:
[...]
> > later needs a special treatment while the first one is ok? There are
> > quite some resources which allow a non privileged user to consume a lot
> > of memory and the memory controller is the only reliable way to mitigate
> > the risk.
> 
> Yes, memory controller is the only reliable way to mitigate the risk,
> but not all tasks are under the control of a memory controller with
> kernel memory limit.
But those which you do not trust should. So why do we need yet another
mechanism for the reclaim?
[...]
> >> Patch 1 tracks the number of negative dentries present in the LRU
> >> lists and reports it in /proc/sys/fs/dentry-state.
> > If anything I _think_ vmstat would benefit from this because behavior of
> > the memory reclaim does depend on the amount of neg. dentries.
> >
> >> Patch 2 adds a "neg-dentry-pc" sysctl parameter that can be used to to
> >> specify a soft limit on the number of negative allowed as a percentage
> >> of total system memory. This parameter is 0 by default which means no
> >> negative dentry limiting will be performed.
> > percentage has turned out to be a really wrong unit for many tunables
> > over time. Even 1% can be just too much on really large machines.
> 
> Yes, that is true. Do you have any suggestion of what kind of unit
> should be used? I can scale down the unit to 0.1% of the system memory.
> Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
> corresponds to 200k, etc.
I simply think this is a strange user interface. How much is a
reasonable number? How can any admin figure that out?
> >> Patch 3 enables automatic pruning of least recently used negative
> >> dentries when the total number is close to the preset limit.
> > Please explain why this cannot be done in a standard dcache shrinking
> > way. I strongly suspect that you are developing yet another reclaim with
> > its own sets of tunable and bypassing the existing infrastructure. I
> > haven't read patches yet but the cover letter doesn't really explain
> > design much so I am only guessing.
> 
> The standard dcache shrinking happens when the system is almost running
> out of free memory.
Well, the standard reclaim happens when somebody needs memory. We are
usually quite far away from "almost running out of memory". We do
reclaim fs metadata including dentries so I really do not see why
negative ones should be any special here.
> This new shrinker will be turned on when the number
> of negative dentries is closed to the limit even when there are still
> plenty of free memory left. It will stop when the number of negative
> dentries is lowered to a safe level. The new shrinker is designed to
> impose as little overhead to the currently running tasks. That is not
> true for the standard shrinker which will have a rather significant
> performance impact to the currently running tasks.
Do you have any numbers to back your claim? The memory reclaim is
usually quite lightweight. Especially when we have a lot of clean
fs {meta}data
> I can remove the new shrinker if people really don't want to add a new
> one as long as I can keep the option to kill off newly created negative
> dentries when the limit is exceeded.
Please let's not add yet another memory reclaim mechanism. It will just
backfire sooner or later.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-10 14:27     ` Michal Hocko
@ 2018-07-10 16:09       ` Waiman Long
  2018-07-11 10:21         ` Michal Hocko
  0 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-10 16:09 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On 07/10/2018 10:27 AM, Michal Hocko wrote:
> On Mon 09-07-18 12:01:04, Waiman Long wrote:
>> On 07/09/2018 04:19 AM, Michal Hocko wrote:
> [...]
>>> later needs a special treatment while the first one is ok? There are
>>> quite some resources which allow a non privileged user to consume a lot
>>> of memory and the memory controller is the only reliable way to mitigate
>>> the risk.
>> Yes, memory controller is the only reliable way to mitigate the risk,
>> but not all tasks are under the control of a memory controller with
>> kernel memory limit.
> But those which you do not trust should. So why do we need yet another
> mechanism for the reclaim?
Sometimes it could be a programming error in the code. I had seen a
customer report about the negative dentries because of a bug in their
code that generated a lot of negative dentries causing problem. In such
a controlled environment, they may not want to run their applications
under a memory cgroup as there is overhead involved in that. So a
mechanism to highlight and notify the problem is probably good to have.
>
> [...]
>>>> Patch 1 tracks the number of negative dentries present in the LRU
>>>> lists and reports it in /proc/sys/fs/dentry-state.
>>> If anything I _think_ vmstat would benefit from this because behavior of
>>> the memory reclaim does depend on the amount of neg. dentries.
>>>
>>>> Patch 2 adds a "neg-dentry-pc" sysctl parameter that can be used to to
>>>> specify a soft limit on the number of negative allowed as a percentage
>>>> of total system memory. This parameter is 0 by default which means no
>>>> negative dentry limiting will be performed.
>>> percentage has turned out to be a really wrong unit for many tunables
>>> over time. Even 1% can be just too much on really large machines.
>> Yes, that is true. Do you have any suggestion of what kind of unit
>> should be used? I can scale down the unit to 0.1% of the system memory.
>> Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
>> corresponds to 200k, etc.
> I simply think this is a strange user interface. How much is a
> reasonable number? How can any admin figure that out?
Without the optional enforcement, the limit is essentially just a
notification mechanism where the system signals that there is something
wrong going on and the system administrator need to take a look. So it
is perfectly OK if the limit is sufficiently high that normally we won't
need to use that many negative dentries. The goal is to prevent negative
dentries from consuming a significant portion of the system memory.
I am going to reduce the granularity of each unit to 1/1000 of the total
system memory so that for large system with TB of memory, a smaller
amount of memory can be specified.
>>>> Patch 3 enables automatic pruning of least recently used negative
>>>> dentries when the total number is close to the preset limit.
>>> Please explain why this cannot be done in a standard dcache shrinking
>>> way. I strongly suspect that you are developing yet another reclaim with
>>> its own sets of tunable and bypassing the existing infrastructure. I
>>> haven't read patches yet but the cover letter doesn't really explain
>>> design much so I am only guessing.
>> The standard dcache shrinking happens when the system is almost running
>> out of free memory.
> Well, the standard reclaim happens when somebody needs memory. We are
> usually quite far away from "almost running out of memory". We do
> reclaim fs metadata including dentries so I really do not see why
> negative ones should be any special here.
That is fine. I can certainly live without the new reclaim mechanism.
>
>> This new shrinker will be turned on when the number
>> of negative dentries is closed to the limit even when there are still
>> plenty of free memory left. It will stop when the number of negative
>> dentries is lowered to a safe level. The new shrinker is designed to
>> impose as little overhead to the currently running tasks. That is not
>> true for the standard shrinker which will have a rather significant
>> performance impact to the currently running tasks.
> Do you have any numbers to back your claim? The memory reclaim is
> usually quite lightweight. Especially when we have a lot of clean
> fs {meta}data
In the case of dentries, it is the lock hold time of the LRU list that
can impact the normal filesystem operation. The new shrinker that I add
purposely limit the lock hold time whereas the standard shrinker can
hold the LRU for quite a long time if there are a lot of dentries to get
rid of. I have some performance numbers in the cover letter of this
patch about this.
>> I can remove the new shrinker if people really don't want to add a new
>> one as long as I can keep the option to kill off newly created negative
>> dentries when the limit is exceeded.
> Please let's not add yet another memory reclaim mechanism. It will just
> backfire sooner or later.
As said above, I am going to remove the new shrinker in the next version
of the patch. We can always add it back later on if we feel there is a
need to do it.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-10 16:09       ` Waiman Long
@ 2018-07-11 10:21         ` Michal Hocko
  2018-07-11 15:13           ` Waiman Long
  0 siblings, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-11 10:21 UTC (permalink / raw)
  To: Waiman Long
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On Tue 10-07-18 12:09:17, Waiman Long wrote:
> On 07/10/2018 10:27 AM, Michal Hocko wrote:
> > On Mon 09-07-18 12:01:04, Waiman Long wrote:
> >> On 07/09/2018 04:19 AM, Michal Hocko wrote:
[...]
> >>> percentage has turned out to be a really wrong unit for many tunables
> >>> over time. Even 1% can be just too much on really large machines.
> >> Yes, that is true. Do you have any suggestion of what kind of unit
> >> should be used? I can scale down the unit to 0.1% of the system memory.
> >> Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
> >> corresponds to 200k, etc.
> > I simply think this is a strange user interface. How much is a
> > reasonable number? How can any admin figure that out?
> 
> Without the optional enforcement, the limit is essentially just a
> notification mechanism where the system signals that there is something
> wrong going on and the system administrator need to take a look. So it
> is perfectly OK if the limit is sufficiently high that normally we won't
> need to use that many negative dentries. The goal is to prevent negative
> dentries from consuming a significant portion of the system memory.
So again. How do you tell the right number?
> I am going to reduce the granularity of each unit to 1/1000 of the total
> system memory so that for large system with TB of memory, a smaller
> amount of memory can be specified.
It is just a matter of time for this to be too coarse as well.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-11 10:21         ` Michal Hocko
@ 2018-07-11 15:13           ` Waiman Long
  2018-07-11 17:42             ` James Bottomley
  2018-07-12  8:48             ` Michal Hocko
  0 siblings, 2 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-11 15:13 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On 07/11/2018 06:21 AM, Michal Hocko wrote:
> On Tue 10-07-18 12:09:17, Waiman Long wrote:
>> On 07/10/2018 10:27 AM, Michal Hocko wrote:
>>> On Mon 09-07-18 12:01:04, Waiman Long wrote:
>>>> On 07/09/2018 04:19 AM, Michal Hocko wrote:
> [...]
>>>>> percentage has turned out to be a really wrong unit for many tunables
>>>>> over time. Even 1% can be just too much on really large machines.
>>>> Yes, that is true. Do you have any suggestion of what kind of unit
>>>> should be used? I can scale down the unit to 0.1% of the system memory.
>>>> Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
>>>> corresponds to 200k, etc.
>>> I simply think this is a strange user interface. How much is a
>>> reasonable number? How can any admin figure that out?
>> Without the optional enforcement, the limit is essentially just a
>> notification mechanism where the system signals that there is something
>> wrong going on and the system administrator need to take a look. So it
>> is perfectly OK if the limit is sufficiently high that normally we won't
>> need to use that many negative dentries. The goal is to prevent negative
>> dentries from consuming a significant portion of the system memory.
> So again. How do you tell the right number?
I guess it will be more a trial and error kind of adjustment as the
right figure will depend on the kind of workloads being run on the
system. So unless the enforcement option is turned on, setting a limit
that is too small won't have too much impact over than a slight
performance drop because of the invocation of the slowpaths and the
warning messages in the console. Whenever a non-zero value is written
into "neg-dentry-limit", an informational message will be printed about
what the actual negative dentry limits
will be. It can be compared against the current negative dentry number
(5th number) from "dentry-state" to see if there is enough safe margin
to avoid false positive warning.
>
>> I am going to reduce the granularity of each unit to 1/1000 of the total
>> system memory so that for large system with TB of memory, a smaller
>> amount of memory can be specified.
> It is just a matter of time for this to be too coarse as well.
The goal is to not have too much memory being consumed by negative
dentries and also the limit won't be reached by regular daily
activities. So a limit of 1/1000 of the total system memory will be good
enough on large memory system even if the absolute number is really big.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-11 15:13           ` Waiman Long
@ 2018-07-11 17:42             ` James Bottomley
  2018-07-11 19:07               ` Waiman Long
  2018-07-12  8:48             ` Michal Hocko
  1 sibling, 1 reply; 49+ messages in thread
From: James Bottomley @ 2018-07-11 17:42 UTC (permalink / raw)
  To: Waiman Long, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On Wed, 2018-07-11 at 11:13 -0400, Waiman Long wrote:
> On 07/11/2018 06:21 AM, Michal Hocko wrote:
> > On Tue 10-07-18 12:09:17, Waiman Long wrote:
[...]
> > > I am going to reduce the granularity of each unit to 1/1000 of
> > > the total system memory so that for large system with TB of
> > > memory, a smaller amount of memory can be specified.
> > 
> > It is just a matter of time for this to be too coarse as well.
> 
> The goal is to not have too much memory being consumed by negative
> dentries and also the limit won't be reached by regular daily
> activities. So a limit of 1/1000 of the total system memory will be
> good enough on large memory system even if the absolute number is
> really big.
OK, I think the reason we're going round and round here without
converging is that one of the goals of the mm subsystem is to manage
all of our cached objects and to it the negative (and positive)
dentries simply look like a clean cache of objects.  Right at the
moment mm manages them in the same way it manages all the other caches,
a lot of which suffer from the "you can cause lots of allocations to
artificially grow them" problem.  So the main question is why doesn't
the current mm control of the caches work well enough for dentries? 
What are the problems you're seeing that mm should be catching?  If you
can answer this, then we could get on to whether a separate shrinker,
cache separation or some fix in mm itself is the right answer.
What you say above is based on a conclusion: limiting dentries improves
the system performance.  What we're asking for is evidence for that
conclusion so we can explore whether the same would go for any of our
other system caches (so do we have a global cache management problem or
is it only the dentry cache?)
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-11 17:42             ` James Bottomley
@ 2018-07-11 19:07               ` Waiman Long
  2018-07-11 19:21                 ` James Bottomley
  0 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-11 19:07 UTC (permalink / raw)
  To: James Bottomley, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On 07/11/2018 01:42 PM, James Bottomley wrote:
> On Wed, 2018-07-11 at 11:13 -0400, Waiman Long wrote:
>> On 07/11/2018 06:21 AM, Michal Hocko wrote:
>>> On Tue 10-07-18 12:09:17, Waiman Long wrote:
> [...]
>>>> I am going to reduce the granularity of each unit to 1/1000 of
>>>> the total system memory so that for large system with TB of
>>>> memory, a smaller amount of memory can be specified.
>>> It is just a matter of time for this to be too coarse as well.
>> The goal is to not have too much memory being consumed by negative
>> dentries and also the limit won't be reached by regular daily
>> activities. So a limit of 1/1000 of the total system memory will be
>> good enough on large memory system even if the absolute number is
>> really big.
> OK, I think the reason we're going round and round here without
> converging is that one of the goals of the mm subsystem is to manage
> all of our cached objects and to it the negative (and positive)
> dentries simply look like a clean cache of objects.  Right at the
> moment mm manages them in the same way it manages all the other caches,
> a lot of which suffer from the "you can cause lots of allocations to
> artificially grow them" problem.  So the main question is why doesn't
> the current mm control of the caches work well enough for dentries? 
> What are the problems you're seeing that mm should be catching?  If you
> can answer this, then we could get on to whether a separate shrinker,
> cache separation or some fix in mm itself is the right answer.
>
> What you say above is based on a conclusion: limiting dentries improves
> the system performance.  What we're asking for is evidence for that
> conclusion so we can explore whether the same would go for any of our
> other system caches (so do we have a global cache management problem or
> is it only the dentry cache?)
>
> James
>
I am not saying that limiting dentries will improve performance. I am
just saying that unlimited growth in the number of negative dentries
will reduce the amount of memory available to other applications and
hence will have an impact on performance. Normally the amount of memory
consumed by dentries is a very small portion of the system memory.
Depending on memory size, it could be less than 1% or so. In such case,
doubling or even tripling the number of dentries probably won't have
much performance impact.
Unlike positive dentries which are constrained by the # of files in the
filesystems, the growth of negative dentries can be unlimited. A program
bug or a rogue application can easily generate a lot of negative
dentries consuming 10% or more system memory available if it is not
under the control of a memory controller limiting kernel memory.
The purpose of this patchset is to add a mechanism to track and
optionally limit the number of negative dentries that can be created in
a system. A new shrinker is added to round out the package, but it is
not an essential part of the patchset. The default memory shrinker will
be activated when the amount of free memory is low. I am going to drop
that in the next version of the patchset.
This patchset does change slightly the way dentries are handled in the
vfs layer. I will certainly welcome feedback as to whether those changes
are reasonable or not.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-11 19:07               ` Waiman Long
@ 2018-07-11 19:21                 ` James Bottomley
  2018-07-12 15:54                   ` Waiman Long
  0 siblings, 1 reply; 49+ messages in thread
From: James Bottomley @ 2018-07-11 19:21 UTC (permalink / raw)
  To: Waiman Long, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On Wed, 2018-07-11 at 15:07 -0400, Waiman Long wrote:
> On 07/11/2018 01:42 PM, James Bottomley wrote:
> > On Wed, 2018-07-11 at 11:13 -0400, Waiman Long wrote:
> > > On 07/11/2018 06:21 AM, Michal Hocko wrote:
> > > > On Tue 10-07-18 12:09:17, Waiman Long wrote:
> > 
> > [...]
> > > > > I am going to reduce the granularity of each unit to 1/1000
> > > > > of the total system memory so that for large system with TB
> > > > > of memory, a smaller amount of memory can be specified.
> > > > 
> > > > It is just a matter of time for this to be too coarse as well.
> > > 
> > > The goal is to not have too much memory being consumed by
> > > negative
> > > dentries and also the limit won't be reached by regular daily
> > > activities. So a limit of 1/1000 of the total system memory will
> > > be good enough on large memory system even if the absolute number
> > > is really big.
> > 
> > OK, I think the reason we're going round and round here without
> > converging is that one of the goals of the mm subsystem is to
> > manage all of our cached objects and to it the negative (and
> > positive) dentries simply look like a clean cache of
> > objects.  Right at the moment mm manages them in the same way it
> > manages all the other caches, a lot of which suffer from the "you
> > can cause lots of allocations to artificially grow them"
> > problem.  So the main question is why doesn't the current mm
> > control of the caches work well enough for dentries? 
> > What are the problems you're seeing that mm should be catching?  If
> > you can answer this, then we could get on to whether a separate
> > shrinker, cache separation or some fix in mm itself is the right
> > answer.
> > 
> > What you say above is based on a conclusion: limiting dentries
> > improves the system performance.  What we're asking for is evidence
> > for that conclusion so we can explore whether the same would go for
> > any of our other system caches (so do we have a global cache
> > management problem or is it only the dentry cache?)
> > 
> > James
> > 
> 
> I am not saying that limiting dentries will improve performance. I am
> just saying that unlimited growth in the number of negative dentries
> will reduce the amount of memory available to other applications and
> hence will have an impact on performance. Normally the amount of
> memory consumed by dentries is a very small portion of the system
> memory.
OK, can we poke on only this point for a while?  Linux never really has
any "available memory": pretty much as soon as you boot up the system
will consume all your free memory for some type of cache (usually the
page cache which got filled during boot).  The expectation is that in a
steady, running, state the system is using almost all available memory
for caching something ... if it's not negative dentries it will be
something else.  The mm assumption is that clean cache is so cheap to
recover that it's almost equivalent to free memory and your patch is
saying this isn't so and we have a problem dumping the dentry cache.
So, why can't we treat the dentry cache as equivalent to free memory? 
What in your tests is making it harder to recover the memory in the
dentry cache?
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-11 19:21                 ` James Bottomley
@ 2018-07-12 15:54                   ` Waiman Long
  2018-07-12 16:04                     ` James Bottomley
  0 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-12 15:54 UTC (permalink / raw)
  To: James Bottomley, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On 07/11/2018 03:21 PM, James Bottomley wrote:
> On Wed, 2018-07-11 at 15:07 -0400, Waiman Long wrote:
>> On 07/11/2018 01:42 PM, James Bottomley wrote:
>>> On Wed, 2018-07-11 at 11:13 -0400, Waiman Long wrote:
>>>> On 07/11/2018 06:21 AM, Michal Hocko wrote:
>>>>> On Tue 10-07-18 12:09:17, Waiman Long wrote:
>>> [...]
>>>>>> I am going to reduce the granularity of each unit to 1/1000
>>>>>> of the total system memory so that for large system with TB
>>>>>> of memory, a smaller amount of memory can be specified.
>>>>> It is just a matter of time for this to be too coarse as well.
>>>> The goal is to not have too much memory being consumed by
>>>> negative
>>>> dentries and also the limit won't be reached by regular daily
>>>> activities. So a limit of 1/1000 of the total system memory will
>>>> be good enough on large memory system even if the absolute number
>>>> is really big.
>>> OK, I think the reason we're going round and round here without
>>> converging is that one of the goals of the mm subsystem is to
>>> manage all of our cached objects and to it the negative (and
>>> positive) dentries simply look like a clean cache of
>>> objects.  Right at the moment mm manages them in the same way it
>>> manages all the other caches, a lot of which suffer from the "you
>>> can cause lots of allocations to artificially grow them"
>>> problem.  So the main question is why doesn't the current mm
>>> control of the caches work well enough for dentries? 
>>> What are the problems you're seeing that mm should be catching?  If
>>> you can answer this, then we could get on to whether a separate
>>> shrinker, cache separation or some fix in mm itself is the right
>>> answer.
>>>
>>> What you say above is based on a conclusion: limiting dentries
>>> improves the system performance.  What we're asking for is evidence
>>> for that conclusion so we can explore whether the same would go for
>>> any of our other system caches (so do we have a global cache
>>> management problem or is it only the dentry cache?)
>>>
>>> James
>>>
>> I am not saying that limiting dentries will improve performance. I am
>> just saying that unlimited growth in the number of negative dentries
>> will reduce the amount of memory available to other applications and
>> hence will have an impact on performance. Normally the amount of
>> memory consumed by dentries is a very small portion of the system
>> memory.
> OK, can we poke on only this point for a while?  Linux never really has
> any "available memory": pretty much as soon as you boot up the system
> will consume all your free memory for some type of cache (usually the
> page cache which got filled during boot).  The expectation is that in a
> steady, running, state the system is using almost all available memory
> for caching something ... if it's not negative dentries it will be
> something else.  The mm assumption is that clean cache is so cheap to
> recover that it's almost equivalent to free memory and your patch is
> saying this isn't so and we have a problem dumping the dentry cache.
>
> So, why can't we treat the dentry cache as equivalent to free memory? 
> What in your tests is making it harder to recover the memory in the
> dentry cache?
>
> James
It is not that dentry cache is harder to get rid of than the other
memory. It is that the ability of generate unlimited number of negative
dentries that will displace other useful memory from the system. What
the patch is trying to do is to have a warning or notification system in
place to spot unusual activities in regard to the number of negative
dentries in the system. The system administrators can then decide on
what to do next.
For many user activities, there are ways to audit what the users are
doing and what resources they are consuming. I don't think that is the
case for negative dentries. The closest I can think of is the use of
memory controller to limit the amount of kernel memory use. This
patchset will provide more visibility about the memory consumption of
negative dentries for the system as a whole, though it won't go into the
per-user level. We just don't want a disproportionate amount of memory
to be used up by negative dentries.
Cheers,
Longman
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 15:54                   ` Waiman Long
@ 2018-07-12 16:04                     ` James Bottomley
  2018-07-12 16:26                       ` Waiman Long
  2018-07-12 16:49                       ` Matthew Wilcox
  0 siblings, 2 replies; 49+ messages in thread
From: James Bottomley @ 2018-07-12 16:04 UTC (permalink / raw)
  To: Waiman Long, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On Thu, 2018-07-12 at 11:54 -0400, Waiman Long wrote:
> On 07/11/2018 03:21 PM, James Bottomley wrote:
> > On Wed, 2018-07-11 at 15:07 -0400, Waiman Long wrote:
> > > On 07/11/2018 01:42 PM, James Bottomley wrote:
> > > > On Wed, 2018-07-11 at 11:13 -0400, Waiman Long wrote:
> > > > > On 07/11/2018 06:21 AM, Michal Hocko wrote:
> > > > > > On Tue 10-07-18 12:09:17, Waiman Long wrote:
> > > > 
> > > > [...]
> > > > > > > I am going to reduce the granularity of each unit to
> > > > > > > 1/1000
> > > > > > > of the total system memory so that for large system with
> > > > > > > TB
> > > > > > > of memory, a smaller amount of memory can be specified.
> > > > > > 
> > > > > > It is just a matter of time for this to be too coarse as
> > > > > > well.
> > > > > 
> > > > > The goal is to not have too much memory being consumed by
> > > > > negative
> > > > > dentries and also the limit won't be reached by regular daily
> > > > > activities. So a limit of 1/1000 of the total system memory
> > > > > will
> > > > > be good enough on large memory system even if the absolute
> > > > > number
> > > > > is really big.
> > > > 
> > > > OK, I think the reason we're going round and round here without
> > > > converging is that one of the goals of the mm subsystem is to
> > > > manage all of our cached objects and to it the negative (and
> > > > positive) dentries simply look like a clean cache of
> > > > objects.  Right at the moment mm manages them in the same way
> > > > it
> > > > manages all the other caches, a lot of which suffer from the
> > > > "you
> > > > can cause lots of allocations to artificially grow them"
> > > > problem.  So the main question is why doesn't the current mm
> > > > control of the caches work well enough for dentries? 
> > > > What are the problems you're seeing that mm should be
> > > > catching?  If
> > > > you can answer this, then we could get on to whether a separate
> > > > shrinker, cache separation or some fix in mm itself is the
> > > > right
> > > > answer.
> > > > 
> > > > What you say above is based on a conclusion: limiting dentries
> > > > improves the system performance.  What we're asking for is
> > > > evidence
> > > > for that conclusion so we can explore whether the same would go
> > > > for
> > > > any of our other system caches (so do we have a global cache
> > > > management problem or is it only the dentry cache?)
> > > > 
> > > > James
> > > > 
> > > 
> > > I am not saying that limiting dentries will improve performance.
> > > I am
> > > just saying that unlimited growth in the number of negative
> > > dentries
> > > will reduce the amount of memory available to other applications
> > > and
> > > hence will have an impact on performance. Normally the amount of
> > > memory consumed by dentries is a very small portion of the system
> > > memory.
> > 
> > OK, can we poke on only this point for a while?  Linux never really
> > has
> > any "available memory": pretty much as soon as you boot up the
> > system
> > will consume all your free memory for some type of cache (usually
> > the
> > page cache which got filled during boot).  The expectation is that
> > in a
> > steady, running, state the system is using almost all available
> > memory
> > for caching something ... if it's not negative dentries it will be
> > something else.  The mm assumption is that clean cache is so cheap
> > to
> > recover that it's almost equivalent to free memory and your patch
> > is
> > saying this isn't so and we have a problem dumping the dentry
> > cache.
> > 
> > So, why can't we treat the dentry cache as equivalent to free
> > memory? 
> > What in your tests is making it harder to recover the memory in the
> > dentry cache?
> > 
> > James
> 
> It is not that dentry cache is harder to get rid of than the other
> memory. It is that the ability of generate unlimited number of
> negative dentries that will displace other useful memory from the
> system. What the patch is trying to do is to have a warning or
> notification system in place to spot unusual activities in regard to
> the number of negative dentries in the system. The system
> administrators can then decide on what to do next.
But every cache has this property: I can cause the same effect by doing
a streaming read on a multi gigabyte file: the page cache will fill
with the clean pages belonging to the file until I run out of memory
and it has to start evicting older cache entries.  Once we hit the
steady state of minimal free memory, the mm subsytem tries to balance
the cache requests (like my streaming read) against the existing pool
of cached objects.
The question I'm trying to get an answer to is why does the dentry
cache need special limits when the mm handling of the page cache (and
other mm caches) just works?
James
> For many user activities, there are ways to audit what the users are
> doing and what resources they are consuming. I don't think that is
> the case for negative dentries. The closest I can think of is the use
> of memory controller to limit the amount of kernel memory use. This
> patchset will provide more visibility about the memory consumption of
> negative dentries for the system as a whole, though it won't go into
> the per-user level. We just don't want a disproportionate amount of
> memory to be used up by negative dentries.
> 
> Cheers,
> Longman
> 
> Cheers,
> Longman
> 
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 16:04                     ` James Bottomley
@ 2018-07-12 16:26                       ` Waiman Long
  2018-07-12 17:33                         ` James Bottomley
  2018-07-12 16:49                       ` Matthew Wilcox
  1 sibling, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-12 16:26 UTC (permalink / raw)
  To: James Bottomley, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On 07/12/2018 12:04 PM, James Bottomley wrote:
> On Thu, 2018-07-12 at 11:54 -0400, Waiman Long wrote:
>>
>> It is not that dentry cache is harder to get rid of than the other
>> memory. It is that the ability of generate unlimited number of
>> negative dentries that will displace other useful memory from the
>> system. What the patch is trying to do is to have a warning or
>> notification system in place to spot unusual activities in regard to
>> the number of negative dentries in the system. The system
>> administrators can then decide on what to do next.
> But every cache has this property: I can cause the same effect by doing
> a streaming read on a multi gigabyte file: the page cache will fill
> with the clean pages belonging to the file until I run out of memory
> and it has to start evicting older cache entries.  Once we hit the
> steady state of minimal free memory, the mm subsytem tries to balance
> the cache requests (like my streaming read) against the existing pool
> of cached objects.
>
> The question I'm trying to get an answer to is why does the dentry
> cache need special limits when the mm handling of the page cache (and
> other mm caches) just works?
>
> James
>
I/O activities can be easily tracked. Generation of negative dentries,
however, is more insidious. So the ability to track and be notified when
too many negative dentries are created can be a useful tool for the
system administrators. Besides, there are paranoid users out there who
want to have control of as much as system parameters as possible.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 16:26                       ` Waiman Long
@ 2018-07-12 17:33                         ` James Bottomley
  2018-07-13 15:32                           ` Waiman Long
  0 siblings, 1 reply; 49+ messages in thread
From: James Bottomley @ 2018-07-12 17:33 UTC (permalink / raw)
  To: Waiman Long, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On Thu, 2018-07-12 at 12:26 -0400, Waiman Long wrote:
> On 07/12/2018 12:04 PM, James Bottomley wrote:
> > On Thu, 2018-07-12 at 11:54 -0400, Waiman Long wrote:
> > > 
> > > It is not that dentry cache is harder to get rid of than the
> > > other memory. It is that the ability of generate unlimited number
> > > of negative dentries that will displace other useful memory from
> > > the system. What the patch is trying to do is to have a warning
> > > or notification system in place to spot unusual activities in
> > > regard to the number of negative dentries in the system. The
> > > system administrators can then decide on what to do next.
> > 
> > But every cache has this property: I can cause the same effect by
> > doing a streaming read on a multi gigabyte file: the page cache
> > will fill with the clean pages belonging to the file until I run
> > out of memory and it has to start evicting older cache
> > entries.  Once we hit the steady state of minimal free memory, the
> > mm subsytem tries to balance the cache requests (like my streaming
> > read) against the existing pool of cached objects.
> > 
> > The question I'm trying to get an answer to is why does the dentry
> > cache need special limits when the mm handling of the page cache
> > (and other mm caches) just works?
> > 
> > James
> > 
> 
> I/O activities can be easily tracked.
Tracked?  What do you mean tracked?  you mean we can control the page
cache through userfaultfd or something without resorting to cgroup
limits or something different?  I mean all caches are "tracked" because
otherwise we wouldn't know whether we have to retrieve/generate the
object or pull it from the cache.  If it's just about cache state,
what's wrong with /proc/sys/fs/dentry-state?
>  Generation of negative dentries, however, is more insidious. So the
> ability to track and be notified when too many negative dentries are
> created can be a useful tool for the system administrators. Besides,
> there are paranoid users out there who want to have control of as
> much as system parameters as possible.
To what end?  what problem are these administrators trying to solve? 
You keep coming back to the circular argument that the problem they're
trying to solve is limiting negative dentries, but I want to know what
issue they see in their systems that causes them to ask for this knob.
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 17:33                         ` James Bottomley
@ 2018-07-13 15:32                           ` Waiman Long
  0 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-13 15:32 UTC (permalink / raw)
  To: James Bottomley, Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, Wangkai (Kevin C)
On 07/12/2018 01:33 PM, James Bottomley wrote:
> On Thu, 2018-07-12 at 12:26 -0400, Waiman Long wrote:
>> On 07/12/2018 12:04 PM, James Bottomley wrote:
>>> On Thu, 2018-07-12 at 11:54 -0400, Waiman Long wrote:
>>>> It is not that dentry cache is harder to get rid of than the
>>>> other memory. It is that the ability of generate unlimited number
>>>> of negative dentries that will displace other useful memory from
>>>> the system. What the patch is trying to do is to have a warning
>>>> or notification system in place to spot unusual activities in
>>>> regard to the number of negative dentries in the system. The
>>>> system administrators can then decide on what to do next.
>>> But every cache has this property: I can cause the same effect by
>>> doing a streaming read on a multi gigabyte file: the page cache
>>> will fill with the clean pages belonging to the file until I run
>>> out of memory and it has to start evicting older cache
>>> entries.  Once we hit the steady state of minimal free memory, the
>>> mm subsytem tries to balance the cache requests (like my streaming
>>> read) against the existing pool of cached objects.
>>>
>>> The question I'm trying to get an answer to is why does the dentry
>>> cache need special limits when the mm handling of the page cache
>>> (and other mm caches) just works?
>>>
>>> James
>>>
>> I/O activities can be easily tracked.
> Tracked?  What do you mean tracked?  you mean we can control the page
> cache through userfaultfd or something without resorting to cgroup
> limits or something different?  I mean all caches are "tracked" because
> otherwise we wouldn't know whether we have to retrieve/generate the
> object or pull it from the cache.  If it's just about cache state,
> what's wrong with /proc/sys/fs/dentry-state?
Sorry for being imprecise. What I meant is it is easy to find out which
tasks issue the most I/O request and consume the most I/O bandwidth.
IOW, which one we can blame if there are too much I/O activities. On the
other hand, it is not that easy to find out which task generates the
most negative dentries.
>>  Generation of negative dentries, however, is more insidious. So the
>> ability to track and be notified when too many negative dentries are
>> created can be a useful tool for the system administrators. Besides,
>> there are paranoid users out there who want to have control of as
>> much as system parameters as possible.
> To what end?  what problem are these administrators trying to solve? 
> You keep coming back to the circular argument that the problem they're
> trying to solve is limiting negative dentries, but I want to know what
> issue they see in their systems that causes them to ask for this knob.
>
> James
>
I would say most system administrators don't want to have surprise. They
don't want to see bad performance or other system problems and after
some digging find out that some tasks are generating too much negative
dentries and thus consuming too much memory, for example. They would
certainly like to a way to notify them before the problems happen.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 16:04                     ` James Bottomley
  2018-07-12 16:26                       ` Waiman Long
@ 2018-07-12 16:49                       ` Matthew Wilcox
  2018-07-12 17:21                         ` James Bottomley
  1 sibling, 1 reply; 49+ messages in thread
From: Matthew Wilcox @ 2018-07-12 16:49 UTC (permalink / raw)
  To: James Bottomley
  Cc: Waiman Long, Michal Hocko, Alexander Viro, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, linux-kernel, linux-fsdevel,
	linux-mm, linux-doc, Linus Torvalds, Jan Kara, Paul E. McKenney,
	Andrew Morton, Ingo Molnar, Miklos Szeredi, Larry Woodman,
	Wangkai (Kevin C)
On Thu, Jul 12, 2018 at 09:04:54AM -0700, James Bottomley wrote:
> On Thu, 2018-07-12 at 11:54 -0400, Waiman Long wrote:
> > It is not that dentry cache is harder to get rid of than the other
> > memory. It is that the ability of generate unlimited number of
> > negative dentries that will displace other useful memory from the
> > system. What the patch is trying to do is to have a warning or
> > notification system in place to spot unusual activities in regard to
> > the number of negative dentries in the system. The system
> > administrators can then decide on what to do next.
> 
> But every cache has this property: I can cause the same effect by doing
> a streaming read on a multi gigabyte file: the page cache will fill
> with the clean pages belonging to the file until I run out of memory
> and it has to start evicting older cache entries.  Once we hit the
> steady state of minimal free memory, the mm subsytem tries to balance
> the cache requests (like my streaming read) against the existing pool
> of cached objects.
> 
> The question I'm trying to get an answer to is why does the dentry
> cache need special limits when the mm handling of the page cache (and
> other mm caches) just works?
I don't know that it does work.  Or that it works well.
When we try to allocate something and there's no memory readily available,
we ask all the shrinkers to shrink in order to free up memory.  That leads
to one kind of allocation (eg dentries) being able to easily kick all
the page cache out of the machine.
What we could do instead is first call the shrinker for the type of
object being allocated.  That is, assume the system is more or less in
equilibrium between all the different things it could be allocating,
and if something needs to be kicked out, it's better to kick out this
kind of thing rather than changing the equilibrium.
Of course, workloads change over time, and sometimes we should accept we
need more dentries and less page cache, or vice versa.  So we'd need a
scheme for the shrinker to say "this is getting hard, I think we do need
more dentries", and then we'd move on to calling the other shrinkers to
reclaim inodes or page cache or whatever.
I don't think we even try to work at this level today.  But it would
have the distinct advantage that we can implement this in the slab/slub
code rather than touching the page allocator.
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 16:49                       ` Matthew Wilcox
@ 2018-07-12 17:21                         ` James Bottomley
  2018-07-12 18:06                           ` Linus Torvalds
  0 siblings, 1 reply; 49+ messages in thread
From: James Bottomley @ 2018-07-12 17:21 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Waiman Long, Michal Hocko, Alexander Viro, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, linux-kernel, linux-fsdevel,
	linux-mm, linux-doc, Linus Torvalds, Jan Kara, Paul E. McKenney,
	Andrew Morton, Ingo Molnar, Miklos Szeredi, Larry Woodman,
	Wangkai (Kevin C)
On Thu, 2018-07-12 at 09:49 -0700, Matthew Wilcox wrote:
> On Thu, Jul 12, 2018 at 09:04:54AM -0700, James Bottomley wrote:
[...]
> > The question I'm trying to get an answer to is why does the dentry
> > cache need special limits when the mm handling of the page cache
> > (and other mm caches) just works?
> 
> I don't know that it does work.  Or that it works well.
I'm not claiming the general heuristics are perfect (in fact I know we
still have a lot of problems with dirty reclaim and writeback).  I am
willing to bet that any discussion of the heuristics will get a lot of
opposition if we try to introduce per-object limits for every object.
Our clean cache heuristics are simple: clean caches are easy to reclaim
and are thus treated like free memory (there's little cost to filling
them or reclaiming them again).  There is speculation that this
equivalence is problematic because the shrinkers reclaim objects but mm
is looking to reclaim pages and thus you can end up with a few objects
pinning many pages even if the shrinker freed a lot of them.
However, we haven't even reached that level yet ... I'm still
struggling to establish that we have a problem with the behaviour of
the dentry cache under current mm heuristics.  
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 17:21                         ` James Bottomley
@ 2018-07-12 18:06                           ` Linus Torvalds
  2018-07-12 19:57                             ` James Bottomley
  0 siblings, 1 reply; 49+ messages in thread
From: Linus Torvalds @ 2018-07-12 18:06 UTC (permalink / raw)
  To: James Bottomley
  Cc: Matthew Wilcox, Waiman Long, Michal Hocko, Al Viro,
	Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
[-- Attachment #1: Type: text/plain, Size: 798 bytes --]
On Thu, Jul 12, 2018 at 10:21 AM James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
>
> On Thu, 2018-07-12 at 09:49 -0700, Matthew Wilcox wrote:
> >
> > I don't know that it does work.  Or that it works well.
>
> I'm not claiming the general heuristics are perfect (in fact I know we
> still have a lot of problems with dirty reclaim and writeback).
I think this whole "this is about running out of memory" approach is wrong.
We *should* handle that well. Or well enough in practice, at least.
Do we? Maybe not. Should the dcache be the one area to be policed and
worked around? Probably not.
But there may be other reasons to just limit negative dentries.
What does the attached program do to people? It's written to be
intentionally annoying to the dcache.
               Linus
[-- Attachment #2: t.c --]
[-- Type: text/x-csrc, Size: 617 bytes --]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
static void die(const char *msg)
{
	fputs(msg, stderr);
	exit(1);
}
/*
 * Use a "longish" filename to make more trouble for the dcache.
 *
 * The inline length is 32-40 bytes depending on kernel config,
 * so make it larger than that.
 */
int main(void)
{
	int i;
	char buffer[64];
	memset(buffer, 'a', sizeof(buffer));
	buffer[63] = 0;
	for (i = 0; i < 100000000; i++) {
		snprintf(buffer+40, sizeof(buffer)-40, "-%08d", i);
		if (open(buffer, O_RDONLY) >= 0)
			die("You're missing the point\n");
	}
	return 0;
}
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 18:06                           ` Linus Torvalds
@ 2018-07-12 19:57                             ` James Bottomley
  2018-07-13  0:36                               ` Dave Chinner
  0 siblings, 1 reply; 49+ messages in thread
From: James Bottomley @ 2018-07-12 19:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Matthew Wilcox, Waiman Long, Michal Hocko, Al Viro,
	Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Thu, 2018-07-12 at 11:06 -0700, Linus Torvalds wrote:
> On Thu, Jul 12, 2018 at 10:21 AM James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > 
> > On Thu, 2018-07-12 at 09:49 -0700, Matthew Wilcox wrote:
> > > 
> > > I don't know that it does work.  Or that it works well.
> > 
> > I'm not claiming the general heuristics are perfect (in fact I know
> > we
> > still have a lot of problems with dirty reclaim and writeback).
> 
> I think this whole "this is about running out of memory" approach is
> wrong.
> 
> We *should* handle that well. Or well enough in practice, at least.
> 
> Do we? Maybe not. Should the dcache be the one area to be policed and
> worked around? Probably not.
> 
> But there may be other reasons to just limit negative dentries.
> 
> What does the attached program do to people? It's written to be
> intentionally annoying to the dcache.
So it's interesting.  What happens for me is that I start out at pretty
much no free memory so the programme slowly starts to fill up my
available swap without shrinking my page cache (presumably it's causing
dirty anonymous objects to be pushed out) and the dcache grows a bit.
Then when my free swap reaches 0 we start to reclaim the dcache and it
shrinks again (apparently still keeping the page cache at around 1.8G).
 The system seems perfectly usable while this is running (tried browser
and a couple of compiles) ... any calls for free memory seem to come
out of the enormous but easily reclaimable dcache.
The swap effect is unexpected, but everything else seems to be going
according to how I would wish.
When I kill the programme I get about a megabyte of swap back but it's
staying with the rest of swap occupied.  When all this started I had an
8G laptop with 2G of swap of which 1G was used.  Now I have 2G of swap
used but it all seems to be running OK.
So what I mean by dcache grows a bit is this:
I missed checking it before I started, but it eventually grew to
jejb@jarvis:~> cat /proc/sys/fs/dentry-state 
2841534	2816297	45	0	0	0
Before eventually going back after I killed the programme to
jejb@jarvis:~> cat /proc/sys/fs/dentry-state 
806559	781138	45	0	0	0
I just tried it again and this time the dcache only peaked at 
jejb@jarvis:~> cat /proc/sys/fs/dentry-state 
2321933	2296607	45	0	0	0
What surprises me most about this behaviour is the steadiness of the
page cache ... I would have thought we'd have shrunk it somewhat given
the intense call on the dcache.
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 19:57                             ` James Bottomley
@ 2018-07-13  0:36                               ` Dave Chinner
  2018-07-13 15:46                                 ` James Bottomley
  2018-07-16  9:09                                 ` Michal Hocko
  0 siblings, 2 replies; 49+ messages in thread
From: Dave Chinner @ 2018-07-13  0:36 UTC (permalink / raw)
  To: James Bottomley
  Cc: Linus Torvalds, Matthew Wilcox, Waiman Long, Michal Hocko,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Thu, Jul 12, 2018 at 12:57:15PM -0700, James Bottomley wrote:
> What surprises me most about this behaviour is the steadiness of the
> page cache ... I would have thought we'd have shrunk it somewhat given
> the intense call on the dcache.
Oh, good, the page cache vs superblock shrinker balancing still
protects the working set of each cache the way it's supposed to
under heavy single cache pressure. :)
Keep in mind that the amount of work slab cache shrinkers perform is
directly proportional to the amount of page cache reclaim that is
performed and the size of the slab cache being reclaimed.  IOWs,
under a "single cache pressure" workload we should be directing
reclaim work to the huge cache creating the pressure and do very
little reclaim from other caches....
[ What follows from here is conjecture, but is based on what I've
seen in the past 10+ years on systems with large numbers of negative
dentries and fragmented dentry/inode caches. ]
However, this only reaches steady state if the reclaim rate can keep
ahead of the allocation rate. This single threaded micro-workload
won't result in an internally fragmented dentry slab cache, so
reclaim is going to be as efficient as possible and have the CPU to
keep up with the allocation rate.  i.e. Bulk negative dentry reclaim
is cheap, in LRU order, and frees slab pages quickly and efficiently
in large batches so steady state is easily reached.
Problems arise when the slab *page* reclaim rate drops below
allocation rate. i.e when you have short term (negative) dentries
mixed into the same slab pages as long term stable dentries. This
causes the dentry cache to fragment internally - reclaim hits the
negative dentries and creates large numbers of partial pages - and
so reclaim of negative dentries will fail to free memory. Creating
new negative dentries then fills these partial pages first, and so
the alloc/reclaim cycles on negative dentries only ever produce
partial pages and never free slab cache pages. IOWs, the cost of
reclaim slab *pages* goes way up despite the fact that the cost of
reclaiming individual dentries has remained the same.
That's the underlying problem here - the cost of reclaiming dentries
is constant but the cost of reclaiming *slab pages* is not.  It is
not uncommon to have to trash 90% of the dentry or inode caches to
reduce internal fragmentation down to the point where pages start to
get freed and the per-slab-page reclaim cost reduces to be less than
the allocation cost. Then we see the system return to normal steady
state behaviour.
In situations where lots of negative dentries are created by
production workloads, that "90%" of the cache that needs to be
reclaimed to fix the internal fragmentation issue is all negative
dentries and just enough of the real dentries to be freeing
quantities of partial pages in the slab. Hence negative dentries are
seen as the problem because they make up the vast majority of the
dentries that get reclaimed when the problem goes away.
By limiting the number of negative dentries in this case, internal
slab fragmentation is reduced such that reclaim cost never gets out
of control. While it appears to "fix" the symptoms, it doesn't
address the underlying problem. It is a partial solution at best but
at worst it's another opaque knob that nobody knows how or when to
tune.
Very few microbenchmarks expose this internal slab fragmentation
problem because they either don't run long enough, don't create
memory pressure, or don't have access patterns that mix long and
short term slab objects together in a way that causes slab
fragmentation. Run some cold cache directory traversals (git
status?) at the same time you are creating negative dentries so you
create pinned partial pages in the slab cache and see how the
behaviour changes....
Cheers,
Dave.
-- 
Dave Chinner
david@fromorbit.com
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-13  0:36                               ` Dave Chinner
@ 2018-07-13 15:46                                 ` James Bottomley
  2018-07-13 23:17                                   ` Dave Chinner
  2018-07-16  9:10                                   ` Michal Hocko
  2018-07-16  9:09                                 ` Michal Hocko
  1 sibling, 2 replies; 49+ messages in thread
From: James Bottomley @ 2018-07-13 15:46 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Linus Torvalds, Matthew Wilcox, Waiman Long, Michal Hocko,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Fri, 2018-07-13 at 10:36 +1000, Dave Chinner wrote:
> On Thu, Jul 12, 2018 at 12:57:15PM -0700, James Bottomley wrote:
> > What surprises me most about this behaviour is the steadiness of
> > the page cache ... I would have thought we'd have shrunk it
> > somewhat given the intense call on the dcache.
> 
> Oh, good, the page cache vs superblock shrinker balancing still
> protects the working set of each cache the way it's supposed to
> under heavy single cache pressure. :)
Well, yes, but my expectation is most of the page cache is clean, so
easily reclaimable.  I suppose part of my surprise is that I expected
us to reclaim the clean caches first before we started pushing out the
dirty stuff and reclaiming it.  I'm not saying it's a bad thing, just
saying I didn't expect us to make such good decisions under the
parameters of this test.
> Keep in mind that the amount of work slab cache shrinkers perform is
> directly proportional to the amount of page cache reclaim that is
> performed and the size of the slab cache being reclaimed.  IOWs,
> under a "single cache pressure" workload we should be directing
> reclaim work to the huge cache creating the pressure and do very
> little reclaim from other caches....
That definitely seems to happen.  The thing I was most surprised about
is the steady pushing of anonymous objects to swap.  I agree the dentry
cache doesn't seem to be growing hugely after the initial jump, so it
seems to be the largest source of reclaim.
> [ What follows from here is conjecture, but is based on what I've
> seen in the past 10+ years on systems with large numbers of negative
> dentries and fragmented dentry/inode caches. ]
OK, so I fully agree with the concern about pathological object vs page
freeing problems (I referred to it previously).  However, I did think
the compaction work that's been ongoing in mm was supposed to help
here?
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-13 15:46                                 ` James Bottomley
@ 2018-07-13 23:17                                   ` Dave Chinner
  2018-07-16  9:10                                   ` Michal Hocko
  1 sibling, 0 replies; 49+ messages in thread
From: Dave Chinner @ 2018-07-13 23:17 UTC (permalink / raw)
  To: James Bottomley
  Cc: Linus Torvalds, Matthew Wilcox, Waiman Long, Michal Hocko,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Fri, Jul 13, 2018 at 08:46:52AM -0700, James Bottomley wrote:
> On Fri, 2018-07-13 at 10:36 +1000, Dave Chinner wrote:
> > On Thu, Jul 12, 2018 at 12:57:15PM -0700, James Bottomley wrote:
> > > What surprises me most about this behaviour is the steadiness of
> > > the page cache ... I would have thought we'd have shrunk it
> > > somewhat given the intense call on the dcache.
> > 
> > Oh, good, the page cache vs superblock shrinker balancing still
> > protects the working set of each cache the way it's supposed to
> > under heavy single cache pressure. :)
> 
> Well, yes, but my expectation is most of the page cache is clean, so
> easily reclaimable.  I suppose part of my surprise is that I expected
> us to reclaim the clean caches first before we started pushing out the
> dirty stuff and reclaiming it.  I'm not saying it's a bad thing, just
> saying I didn't expect us to make such good decisions under the
> parameters of this test.
The clean caches are still turned over by the workload, but it is
very slow and only enough to eject old objects that have fallen out
of the working set. We've got a lot better at keeping the working
set in memory in adverse conditions over the past few years...
> > Keep in mind that the amount of work slab cache shrinkers perform is
> > directly proportional to the amount of page cache reclaim that is
> > performed and the size of the slab cache being reclaimed.��IOWs,
> > under a "single cache pressure" workload we should be directing
> > reclaim work to the huge cache creating the pressure and do very
> > little reclaim from other caches....
> 
> That definitely seems to happen.  The thing I was most surprised about
> is the steady pushing of anonymous objects to swap.  I agree the dentry
> cache doesn't seem to be growing hugely after the initial jump, so it
> seems to be the largest source of reclaim.
Which means swap behaviour has changed since I last looked at
reclaim balance several years ago. These sorts of dentry/inode loads
never used to push the system to swap. Not saying it's a bad thing,
just that it is different. :)
> > [ What follows from here is conjecture, but is based on what I've
> > seen in the past 10+ years on systems with large numbers of negative
> > dentries and fragmented dentry/inode caches. ]
> 
> OK, so I fully agree with the concern about pathological object vs page
> freeing problems (I referred to it previously).  However, I did think
> the compaction work that's been ongoing in mm was supposed to help
> here?
Compaction doesn't touch slab caches. We can't move active dentries
and other slab objects around in memory because they have external
objects with active references that point directly to them. Getting
exclusive access to active objects and all the things that point to
them from reclaim so we can move them is an intractable problem - it
has sunk slab cache defragmentation every time it has been attempted
in the past 15 years....
Cheers,
Dave.
-- 
Dave Chinner
david@fromorbit.com
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-13 15:46                                 ` James Bottomley
  2018-07-13 23:17                                   ` Dave Chinner
@ 2018-07-16  9:10                                   ` Michal Hocko
  2018-07-16 14:42                                     ` James Bottomley
  1 sibling, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-16  9:10 UTC (permalink / raw)
  To: James Bottomley
  Cc: Dave Chinner, Linus Torvalds, Matthew Wilcox, Waiman Long,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Fri 13-07-18 08:46:52, James Bottomley wrote:
> On Fri, 2018-07-13 at 10:36 +1000, Dave Chinner wrote:
> > On Thu, Jul 12, 2018 at 12:57:15PM -0700, James Bottomley wrote:
> > > What surprises me most about this behaviour is the steadiness of
> > > the page cache ... I would have thought we'd have shrunk it
> > > somewhat given the intense call on the dcache.
> > 
> > Oh, good, the page cache vs superblock shrinker balancing still
> > protects the working set of each cache the way it's supposed to
> > under heavy single cache pressure. :)
> 
> Well, yes, but my expectation is most of the page cache is clean, so
> easily reclaimable.  I suppose part of my surprise is that I expected
> us to reclaim the clean caches first before we started pushing out the
> dirty stuff and reclaiming it.  I'm not saying it's a bad thing, just
> saying I didn't expect us to make such good decisions under the
> parameters of this test.
This is indeed unepxected. Especially when the current LRU reclaim balancing
logic is highly pagecache biased. Are you sure you were not running in a
memcg with a small amount of the pagecache?
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16  9:10                                   ` Michal Hocko
@ 2018-07-16 14:42                                     ` James Bottomley
  0 siblings, 0 replies; 49+ messages in thread
From: James Bottomley @ 2018-07-16 14:42 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Dave Chinner, Linus Torvalds, Matthew Wilcox, Waiman Long,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Mon, 2018-07-16 at 11:10 +0200, Michal Hocko wrote:
> On Fri 13-07-18 08:46:52, James Bottomley wrote:
> > On Fri, 2018-07-13 at 10:36 +1000, Dave Chinner wrote:
> > > On Thu, Jul 12, 2018 at 12:57:15PM -0700, James Bottomley wrote:
> > > > What surprises me most about this behaviour is the steadiness
> > > > of the page cache ... I would have thought we'd have shrunk it
> > > > somewhat given the intense call on the dcache.
> > > 
> > > Oh, good, the page cache vs superblock shrinker balancing still
> > > protects the working set of each cache the way it's supposed to
> > > under heavy single cache pressure. :)
> > 
> > Well, yes, but my expectation is most of the page cache is clean,
> > so easily reclaimable.  I suppose part of my surprise is that I
> > expected us to reclaim the clean caches first before we started
> > pushing out the dirty stuff and reclaiming it.  I'm not saying it's
> > a bad thing, just saying I didn't expect us to make such good
> > decisions under the parameters of this test.
> 
> This is indeed unepxected. Especially when the current LRU reclaim
> balancing logic is highly pagecache biased. Are you sure you were not
> running in a memcg with a small amount of the pagecache?
Yes, absolutely: I just compiled and ran the programme on my laptop
with no type of containment (I trust Linus, right ...)
To be clear, the dirty anon push out was quite slow, so I don't think
mm was using it as a serious source of reclaim, it was probably just
being caught up in some other page clearing process.
James
^ permalink raw reply	[flat|nested] 49+ messages in thread 
 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-13  0:36                               ` Dave Chinner
  2018-07-13 15:46                                 ` James Bottomley
@ 2018-07-16  9:09                                 ` Michal Hocko
  2018-07-16  9:12                                   ` Michal Hocko
                                                     ` (2 more replies)
  1 sibling, 3 replies; 49+ messages in thread
From: Michal Hocko @ 2018-07-16  9:09 UTC (permalink / raw)
  To: Dave Chinner
  Cc: James Bottomley, Linus Torvalds, Matthew Wilcox, Waiman Long,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Fri 13-07-18 10:36:14, Dave Chinner wrote:
[...]
> By limiting the number of negative dentries in this case, internal
> slab fragmentation is reduced such that reclaim cost never gets out
> of control. While it appears to "fix" the symptoms, it doesn't
> address the underlying problem. It is a partial solution at best but
> at worst it's another opaque knob that nobody knows how or when to
> tune.
Would it help to put all the negative dentries into its own slab cache?
> Very few microbenchmarks expose this internal slab fragmentation
> problem because they either don't run long enough, don't create
> memory pressure, or don't have access patterns that mix long and
> short term slab objects together in a way that causes slab
> fragmentation. Run some cold cache directory traversals (git
> status?) at the same time you are creating negative dentries so you
> create pinned partial pages in the slab cache and see how the
> behaviour changes....
Agreed! Slab fragmentation is a real problem we are seeing for quite
some time. We should try to address it rather than paper over it with
weird knobs.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16  9:09                                 ` Michal Hocko
@ 2018-07-16  9:12                                   ` Michal Hocko
  2018-07-16 12:41                                   ` Matthew Wilcox
  2018-07-18 16:17                                   ` Waiman Long
  2 siblings, 0 replies; 49+ messages in thread
From: Michal Hocko @ 2018-07-16  9:12 UTC (permalink / raw)
  To: Dave Chinner
  Cc: James Bottomley, Linus Torvalds, Matthew Wilcox, Waiman Long,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Mon 16-07-18 11:09:01, Michal Hocko wrote:
> On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> [...]
> > By limiting the number of negative dentries in this case, internal
> > slab fragmentation is reduced such that reclaim cost never gets out
> > of control. While it appears to "fix" the symptoms, it doesn't
> > address the underlying problem. It is a partial solution at best but
> > at worst it's another opaque knob that nobody knows how or when to
> > tune.
> 
> Would it help to put all the negative dentries into its own slab cache?
We couldn't http://lkml.kernel.org/r/20180714173516.uumlhs4wgfgrlc32@devuan
Sorry I haven't noticed the other line of discussion
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16  9:09                                 ` Michal Hocko
  2018-07-16  9:12                                   ` Michal Hocko
@ 2018-07-16 12:41                                   ` Matthew Wilcox
  2018-07-16 23:40                                     ` Andrew Morton
  2018-07-18 16:17                                   ` Waiman Long
  2 siblings, 1 reply; 49+ messages in thread
From: Matthew Wilcox @ 2018-07-16 12:41 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Dave Chinner, James Bottomley, Linus Torvalds, Waiman Long,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Mon, Jul 16, 2018 at 11:09:01AM +0200, Michal Hocko wrote:
> On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> [...]
> > By limiting the number of negative dentries in this case, internal
> > slab fragmentation is reduced such that reclaim cost never gets out
> > of control. While it appears to "fix" the symptoms, it doesn't
> > address the underlying problem. It is a partial solution at best but
> > at worst it's another opaque knob that nobody knows how or when to
> > tune.
> 
> Would it help to put all the negative dentries into its own slab cache?
Maybe the dcache should be more sensitive to its own needs.  In __d_alloc,
it could check whether there are a high proportion of negative dentries
and start recycling some existing negative dentries.
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16 12:41                                   ` Matthew Wilcox
@ 2018-07-16 23:40                                     ` Andrew Morton
  2018-07-17  1:30                                       ` Matthew Wilcox
                                                         ` (2 more replies)
  0 siblings, 3 replies; 49+ messages in thread
From: Andrew Morton @ 2018-07-16 23:40 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Michal Hocko, Dave Chinner, James Bottomley, Linus Torvalds,
	Waiman Long, Al Viro, Jonathan Corbet, Luis R. Rodriguez,
	Kees Cook, Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Ingo Molnar,
	Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Mon, 16 Jul 2018 05:41:15 -0700 Matthew Wilcox <willy@infradead.org> wrote:
> On Mon, Jul 16, 2018 at 11:09:01AM +0200, Michal Hocko wrote:
> > On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> > [...]
> > > By limiting the number of negative dentries in this case, internal
> > > slab fragmentation is reduced such that reclaim cost never gets out
> > > of control. While it appears to "fix" the symptoms, it doesn't
> > > address the underlying problem. It is a partial solution at best but
> > > at worst it's another opaque knob that nobody knows how or when to
> > > tune.
> > 
> > Would it help to put all the negative dentries into its own slab cache?
> 
> Maybe the dcache should be more sensitive to its own needs.  In __d_alloc,
> it could check whether there are a high proportion of negative dentries
> and start recycling some existing negative dentries.
Well, yes.
The proposed patchset adds all this background reclaiming.  Problem is
a) that background reclaiming sometimes can't keep up so a synchronous
direct-reclaim was added on top and b) reclaiming dentries in the
background will cause non-dentry-allocating tasks to suffer because of
activity from the dentry-allocating tasks, which is inappropriate.
I expect a better design is something like
__d_alloc()
{
	...
	while (too many dentries)
		call the dcache shrinker
	...
}
and that's it.  This way we have a hard upper limit and only the tasks
which are creating dentries suffer the cost.
Regarding the slab page fragmentation issue: I'm wondering if the whole
idea of balancing the slab scan rates against the page scan rates isn't
really working out.  Maybe shrink_slab() should be sitting there
hammering the caches until they have freed up a particular number of
pages.  Quite a big change, conceptually and implementationally.
Aside: about a billion years ago we were having issues with processes
getting stuck in direct reclaim because other processes were coming in
and stealing away the pages which the direct-reclaimer had just freed. 
One possible solution to that was to make direct-reclaiming tasks
release the freed pages into a list on the task_struct.  So those pages
were invisible to other allocating tasks and were available to the
direct-reclaimer when it returned from the reclaim effort.  I forget
what happened to this.
It's quite a small code change and would provide a mechanism for
implementing the hammer-cache-until-youve-freed-enough design above.
Aside 2: if we *do* do something like the above __d_alloc() pseudo code
then perhaps it could be cast in terms of pages, not dentries.  ie,
__d_alloc()
{
	...
	while (too many pages in dentry_cache)
		call the dcache shrinker
	...
}
and, apart from the external name thing (grr), that should address
these fragmentation issues, no?  I assume it's easy to ask slab how
many pages are presently in use for a particular cache.
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16 23:40                                     ` Andrew Morton
@ 2018-07-17  1:30                                       ` Matthew Wilcox
  2018-07-17  8:33                                       ` Michal Hocko
  2018-07-18 18:39                                       ` Waiman Long
  2 siblings, 0 replies; 49+ messages in thread
From: Matthew Wilcox @ 2018-07-17  1:30 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Michal Hocko, Dave Chinner, James Bottomley, Linus Torvalds,
	Waiman Long, Al Viro, Jonathan Corbet, Luis R. Rodriguez,
	Kees Cook, Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Ingo Molnar,
	Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Mon, Jul 16, 2018 at 04:40:32PM -0700, Andrew Morton wrote:
> On Mon, 16 Jul 2018 05:41:15 -0700 Matthew Wilcox <willy@infradead.org> wrote:
> 
> > On Mon, Jul 16, 2018 at 11:09:01AM +0200, Michal Hocko wrote:
> > > On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> > > [...]
> > > > By limiting the number of negative dentries in this case, internal
> > > > slab fragmentation is reduced such that reclaim cost never gets out
> > > > of control. While it appears to "fix" the symptoms, it doesn't
> > > > address the underlying problem. It is a partial solution at best but
> > > > at worst it's another opaque knob that nobody knows how or when to
> > > > tune.
> > > 
> > > Would it help to put all the negative dentries into its own slab cache?
> > 
> > Maybe the dcache should be more sensitive to its own needs.  In __d_alloc,
> > it could check whether there are a high proportion of negative dentries
> > and start recycling some existing negative dentries.
> 
> Well, yes.
> 
> The proposed patchset adds all this background reclaiming.  Problem is
> a) that background reclaiming sometimes can't keep up so a synchronous
> direct-reclaim was added on top and b) reclaiming dentries in the
> background will cause non-dentry-allocating tasks to suffer because of
> activity from the dentry-allocating tasks, which is inappropriate.
... and it's an awful lot of code (almost 600 lines!) to implement
something fairly conceptually simple.
> I expect a better design is something like
> 
> __d_alloc()
> {
> 	...
> 	while (too many dentries)
> 		call the dcache shrinker
> 	...
> }
> 
> and that's it.  This way we have a hard upper limit and only the tasks
> which are creating dentries suffer the cost.
I think the "too many total dentries" is probably handled just fine
by the core MM.  What the dentry cache needs to prevent is adding a
disproportionately large number of useless negative dentries.  
So I'd rather see:
	if (too_many_negative(nr_dentry, nr_dentry_neg))
		reclaim_negative_dentries(16);
	...
16 feels like a fairly natural batch size.  I don't know what
too_many_negative() looks like.  Maybe it's:
bool too_many_negative(unsigned int total, unsigned int neg)
{
	if (neg < 100)
		return false;
	if (neg * 5 < total * 2)
		return false;
	return true;
}
but it could be almost arbitrarily complex.  I do think it needs to
scale with the total number of dentries, not scale with memory size of
the machine or the number of CPUs or anything similar.
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16 23:40                                     ` Andrew Morton
  2018-07-17  1:30                                       ` Matthew Wilcox
@ 2018-07-17  8:33                                       ` Michal Hocko
  2018-07-19  0:33                                         ` Dave Chinner
  2018-07-18 18:39                                       ` Waiman Long
  2 siblings, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-17  8:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Dave Chinner, James Bottomley, Linus Torvalds,
	Waiman Long, Al Viro, Jonathan Corbet, Luis R. Rodriguez,
	Kees Cook, Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Ingo Molnar,
	Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Mon 16-07-18 16:40:32, Andrew Morton wrote:
> On Mon, 16 Jul 2018 05:41:15 -0700 Matthew Wilcox <willy@infradead.org> wrote:
> 
> > On Mon, Jul 16, 2018 at 11:09:01AM +0200, Michal Hocko wrote:
> > > On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> > > [...]
> > > > By limiting the number of negative dentries in this case, internal
> > > > slab fragmentation is reduced such that reclaim cost never gets out
> > > > of control. While it appears to "fix" the symptoms, it doesn't
> > > > address the underlying problem. It is a partial solution at best but
> > > > at worst it's another opaque knob that nobody knows how or when to
> > > > tune.
> > > 
> > > Would it help to put all the negative dentries into its own slab cache?
> > 
> > Maybe the dcache should be more sensitive to its own needs.  In __d_alloc,
> > it could check whether there are a high proportion of negative dentries
> > and start recycling some existing negative dentries.
> 
> Well, yes.
> 
> The proposed patchset adds all this background reclaiming.  Problem is
> a) that background reclaiming sometimes can't keep up so a synchronous
> direct-reclaim was added on top and b) reclaiming dentries in the
> background will cause non-dentry-allocating tasks to suffer because of
> activity from the dentry-allocating tasks, which is inappropriate.
> 
> I expect a better design is something like
> 
> __d_alloc()
> {
> 	...
> 	while (too many dentries)
> 		call the dcache shrinker
> 	...
> }
> 
> and that's it.  This way we have a hard upper limit and only the tasks
> which are creating dentries suffer the cost.
Not really. If the limit is global then everybody who hits the limit
pays regardless how many negative dentries it produced. So if anything
this really has to be per memcg. And then we are at my previous concern,
why do we even really duplicate something that the core MM already tries
to handle - aka keep balance between cached objects. Negative dentries
are not much different from the real page cache in principle. They are
subtly different from the fragmentation point of view which is
unfortunate but this is a general problem we really ought to handle
anyway.
> Regarding the slab page fragmentation issue: I'm wondering if the whole
> idea of balancing the slab scan rates against the page scan rates isn't
> really working out.  Maybe shrink_slab() should be sitting there
> hammering the caches until they have freed up a particular number of
> pages.  Quite a big change, conceptually and implementationally.
> 
> Aside: about a billion years ago we were having issues with processes
> getting stuck in direct reclaim because other processes were coming in
> and stealing away the pages which the direct-reclaimer had just freed. 
> One possible solution to that was to make direct-reclaiming tasks
> release the freed pages into a list on the task_struct.  So those pages
> were invisible to other allocating tasks and were available to the
> direct-reclaimer when it returned from the reclaim effort.  I forget
> what happened to this.
I used to have patches to do that but then justifying them was not that
easy. Most normal workloads do not suffer much and I only had some
artificial ones which are not enough to justify the additional
complexity. Anyway this could be solved also by playing with watermarks
but I haven't explored much yet.
> It's quite a small code change and would provide a mechanism for
> implementing the hammer-cache-until-youve-freed-enough design above.
> 
> 
> 
> Aside 2: if we *do* do something like the above __d_alloc() pseudo code
> then perhaps it could be cast in terms of pages, not dentries.  ie,
> 
> __d_alloc()
> {
> 	...
> 	while (too many pages in dentry_cache)
> 		call the dcache shrinker
> 	...
> }
> 
> and, apart from the external name thing (grr), that should address
> these fragmentation issues, no?  I assume it's easy to ask slab how
> many pages are presently in use for a particular cache.
I remember Dave Chinner had an idea how to age dcache pages to push
dentries with similar live time to the same page. Not sure what happened
to that.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-17  8:33                                       ` Michal Hocko
@ 2018-07-19  0:33                                         ` Dave Chinner
  2018-07-19  8:45                                           ` Michal Hocko
  0 siblings, 1 reply; 49+ messages in thread
From: Dave Chinner @ 2018-07-19  0:33 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Matthew Wilcox, James Bottomley, Linus Torvalds,
	Waiman Long, Al Viro, Jonathan Corbet, Luis R. Rodriguez,
	Kees Cook, Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Ingo Molnar,
	Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Tue, Jul 17, 2018 at 10:33:26AM +0200, Michal Hocko wrote:
> On Mon 16-07-18 16:40:32, Andrew Morton wrote:
> > On Mon, 16 Jul 2018 05:41:15 -0700 Matthew Wilcox <willy@infradead.org> wrote:
> > It's quite a small code change and would provide a mechanism for
> > implementing the hammer-cache-until-youve-freed-enough design above.
> > 
> > 
> > 
> > Aside 2: if we *do* do something like the above __d_alloc() pseudo code
> > then perhaps it could be cast in terms of pages, not dentries.  ie,
> > 
> > __d_alloc()
> > {
> > 	...
> > 	while (too many pages in dentry_cache)
> > 		call the dcache shrinker
> > 	...
> > }
Direct reclaim will result in all the people who care about long
tail latencies and/or highly concurent workloads starting to hate
you.  Direct reclaim already hammers superblock shrinkers with
excessive concurrency, this would only make it worse.
IOWs, anything like this needs to co-ordinate with other reclaim
operations in progress and, most likely, be done via background
reclaim processing rather than blocking new allocations
indefinitely. background processing can be done in bulk and as
efficiently as possible - concurrent direct reclaim in tiny batches
will just hammer dcache locks and destroy performance when there is
memory pressure.
How many times do we have to learn this lesson the hard way?
> > and, apart from the external name thing (grr), that should address
> > these fragmentation issues, no?  I assume it's easy to ask slab how
> > many pages are presently in use for a particular cache.
> 
> I remember Dave Chinner had an idea how to age dcache pages to push
> dentries with similar live time to the same page. Not sure what happened
> to that.
Same thing that happened to all the "select the dentries on this
page for reclaim". i.e. it's referenced dentries that we can't
reclaim or move that are the issue, not the reclaimable dentries on
the page.
Bsaically, without a hint at allocation time as to the expected life
time of the dentry, we can't be smart about how we select partial
pages to allocate from. And because we don't know at allocation time
if the dentry is going to remain a negative dentry or not, we can't
provide a hint about expected lifetime of teh object being
allocated.
Cheers,
Dave.
-- 
Dave Chinner
david@fromorbit.com
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-19  0:33                                         ` Dave Chinner
@ 2018-07-19  8:45                                           ` Michal Hocko
  2018-07-19  9:13                                             ` Jan Kara
  0 siblings, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-19  8:45 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Andrew Morton, Matthew Wilcox, James Bottomley, Linus Torvalds,
	Waiman Long, Al Viro, Jonathan Corbet, Luis R. Rodriguez,
	Kees Cook, Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Ingo Molnar,
	Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Thu 19-07-18 10:33:29, Dave Chinner wrote:
> On Tue, Jul 17, 2018 at 10:33:26AM +0200, Michal Hocko wrote:
> > On Mon 16-07-18 16:40:32, Andrew Morton wrote:
> > > On Mon, 16 Jul 2018 05:41:15 -0700 Matthew Wilcox <willy@infradead.org> wrote:
> > > It's quite a small code change and would provide a mechanism for
> > > implementing the hammer-cache-until-youve-freed-enough design above.
> > > 
> > > 
> > > 
> > > Aside 2: if we *do* do something like the above __d_alloc() pseudo code
> > > then perhaps it could be cast in terms of pages, not dentries.  ie,
> > > 
> > > __d_alloc()
> > > {
> > > 	...
> > > 	while (too many pages in dentry_cache)
> > > 		call the dcache shrinker
> > > 	...
> > > }
> 
> Direct reclaim will result in all the people who care about long
> tail latencies and/or highly concurent workloads starting to hate
> you.  Direct reclaim already hammers superblock shrinkers with
> excessive concurrency, this would only make it worse.
I can only confirm that! We have something similar in our SLES kernel.
We have page cache soft limit implemented for many years and it is
basically similar thing to above. We just shrink the page cache when we
have too much of it. It turned out to be a complete PITA on large
machines when hundreds of CPUs are fighting for locks. We have tried to
address that but it is a complete whack a mole.
More important lesson from this is that the original motivation for this
functionality was to not allow too much page cache which would push a
useful DB data out to swap. And as it turned out MM internals have
changed a lot since the introduction and we do not really swap out in
presence of the page cache anymore. Moreover we have a much more
effective reclaim protection thanks to memcg low limit reclaim etc.
While that is all good and nice there are still people tunning the
pagecache limit based on some really old admin guides and the feature
makes more harm than good and we see bug reports that system gets
stalled...
I really do not see why limiting (negative) dentries should be any
different.
> IOWs, anything like this needs to co-ordinate with other reclaim
> operations in progress and, most likely, be done via background
> reclaim processing rather than blocking new allocations
> indefinitely. background processing can be done in bulk and as
> efficiently as possible - concurrent direct reclaim in tiny batches
> will just hammer dcache locks and destroy performance when there is
> memory pressure.
Absolutely agreed!
> How many times do we have to learn this lesson the hard way?
> 
> > > and, apart from the external name thing (grr), that should address
> > > these fragmentation issues, no?  I assume it's easy to ask slab how
> > > many pages are presently in use for a particular cache.
> > 
> > I remember Dave Chinner had an idea how to age dcache pages to push
> > dentries with similar live time to the same page. Not sure what happened
> > to that.
> 
> Same thing that happened to all the "select the dentries on this
> page for reclaim". i.e. it's referenced dentries that we can't
> reclaim or move that are the issue, not the reclaimable dentries on
> the page.
> 
> Bsaically, without a hint at allocation time as to the expected life
> time of the dentry, we can't be smart about how we select partial
> pages to allocate from. And because we don't know at allocation time
> if the dentry is going to remain a negative dentry or not, we can't
> provide a hint about expected lifetime of teh object being
> allocated.
Can we allocate a new dentry at the time when we know the life time or
the dentry pointer is so spread by that time that we cannot?
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-19  8:45                                           ` Michal Hocko
@ 2018-07-19  9:13                                             ` Jan Kara
  0 siblings, 0 replies; 49+ messages in thread
From: Jan Kara @ 2018-07-19  9:13 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Dave Chinner, Andrew Morton, Matthew Wilcox, James Bottomley,
	Linus Torvalds, Waiman Long, Al Viro, Jonathan Corbet,
	Luis R. Rodriguez, Kees Cook, Linux Kernel Mailing List,
	linux-fsdevel, linux-mm, open list:DOCUMENTATION, Jan Kara,
	Paul McKenney, Ingo Molnar, Miklos Szeredi, Larry Woodman,
	Wangkai (Kevin,C)
On Thu 19-07-18 10:45:38, Michal Hocko wrote:
> On Thu 19-07-18 10:33:29, Dave Chinner wrote:
> > > > and, apart from the external name thing (grr), that should address
> > > > these fragmentation issues, no?  I assume it's easy to ask slab how
> > > > many pages are presently in use for a particular cache.
> > > 
> > > I remember Dave Chinner had an idea how to age dcache pages to push
> > > dentries with similar live time to the same page. Not sure what happened
> > > to that.
> > 
> > Same thing that happened to all the "select the dentries on this
> > page for reclaim". i.e. it's referenced dentries that we can't
> > reclaim or move that are the issue, not the reclaimable dentries on
> > the page.
> > 
> > Bsaically, without a hint at allocation time as to the expected life
> > time of the dentry, we can't be smart about how we select partial
> > pages to allocate from. And because we don't know at allocation time
> > if the dentry is going to remain a negative dentry or not, we can't
> > provide a hint about expected lifetime of teh object being
> > allocated.
> 
> Can we allocate a new dentry at the time when we know the life time or
> the dentry pointer is so spread by that time that we cannot?
It's difficult. We allocate dentry, put it in our structures, use it for
synchronization e.g. of parallel lookups of the same name (so for that it is
important that it is visible to everybody) and only after that we ask
filesystem what does it have (if anything) under that name... So delaying
allocation would mean overhauling the locking logic in the whole dcache.
								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply	[flat|nested] 49+ messages in thread 
 
 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16 23:40                                     ` Andrew Morton
  2018-07-17  1:30                                       ` Matthew Wilcox
  2018-07-17  8:33                                       ` Michal Hocko
@ 2018-07-18 18:39                                       ` Waiman Long
  2 siblings, 0 replies; 49+ messages in thread
From: Waiman Long @ 2018-07-18 18:39 UTC (permalink / raw)
  To: Andrew Morton, Matthew Wilcox
  Cc: Michal Hocko, Dave Chinner, James Bottomley, Linus Torvalds,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Ingo Molnar,
	Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On 07/16/2018 07:40 PM, Andrew Morton wrote:
> On Mon, 16 Jul 2018 05:41:15 -0700 Matthew Wilcox <willy@infradead.org> wrote:
>
>> On Mon, Jul 16, 2018 at 11:09:01AM +0200, Michal Hocko wrote:
>>> On Fri 13-07-18 10:36:14, Dave Chinner wrote:
>>> [...]
>>>> By limiting the number of negative dentries in this case, internal
>>>> slab fragmentation is reduced such that reclaim cost never gets out
>>>> of control. While it appears to "fix" the symptoms, it doesn't
>>>> address the underlying problem. It is a partial solution at best but
>>>> at worst it's another opaque knob that nobody knows how or when to
>>>> tune.
>>> Would it help to put all the negative dentries into its own slab cache?
>> Maybe the dcache should be more sensitive to its own needs.  In __d_alloc,
>> it could check whether there are a high proportion of negative dentries
>> and start recycling some existing negative dentries.
> Well, yes.
>
> The proposed patchset adds all this background reclaiming.  Problem is
> a) that background reclaiming sometimes can't keep up so a synchronous
> direct-reclaim was added on top and b) reclaiming dentries in the
> background will cause non-dentry-allocating tasks to suffer because of
> activity from the dentry-allocating tasks, which is inappropriate.
I have taken out the background reclaiming in the latest v7 patch for
the concern people have on duplicating the reclaim effort. We can always
add it back on later on if we want to.
> I expect a better design is something like
>
> __d_alloc()
> {
> 	...
> 	while (too many dentries)
> 		call the dcache shrinker
> 	...
> }
>
> and that's it.  This way we have a hard upper limit and only the tasks
> which are creating dentries suffer the cost.
Yes, that is certainly one way of doing it.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread
 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-16  9:09                                 ` Michal Hocko
  2018-07-16  9:12                                   ` Michal Hocko
  2018-07-16 12:41                                   ` Matthew Wilcox
@ 2018-07-18 16:17                                   ` Waiman Long
  2018-07-19  8:48                                     ` Michal Hocko
  2 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-18 16:17 UTC (permalink / raw)
  To: Michal Hocko, Dave Chinner
  Cc: James Bottomley, Linus Torvalds, Matthew Wilcox, Al Viro,
	Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On 07/16/2018 05:09 AM, Michal Hocko wrote:
> On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> [...]
>> By limiting the number of negative dentries in this case, internal
>> slab fragmentation is reduced such that reclaim cost never gets out
>> of control. While it appears to "fix" the symptoms, it doesn't
>> address the underlying problem. It is a partial solution at best but
>> at worst it's another opaque knob that nobody knows how or when to
>> tune.
> Would it help to put all the negative dentries into its own slab cache?
>
>> Very few microbenchmarks expose this internal slab fragmentation
>> problem because they either don't run long enough, don't create
>> memory pressure, or don't have access patterns that mix long and
>> short term slab objects together in a way that causes slab
>> fragmentation. Run some cold cache directory traversals (git
>> status?) at the same time you are creating negative dentries so you
>> create pinned partial pages in the slab cache and see how the
>> behaviour changes....
> Agreed! Slab fragmentation is a real problem we are seeing for quite
> some time. We should try to address it rather than paper over it with
> weird knobs.
I am aware that you don't like the limit knob that control how many
negative dentries are allowed as a percentage of total system memory. I
got comments in the past about doing some kind of auto-tuning. How about
consolidating the 2 knobs that I currently have in the patchset into a
single one with 3 possible values, like:
0 - no limiting
1 - set soft limit to "a constant + 4 x max # of positive dentries" and
warn if exceeded
2 - same limit but kill excess negative dentries after use.
Does that kind of knob make more sense to you?
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-18 16:17                                   ` Waiman Long
@ 2018-07-19  8:48                                     ` Michal Hocko
  0 siblings, 0 replies; 49+ messages in thread
From: Michal Hocko @ 2018-07-19  8:48 UTC (permalink / raw)
  To: Waiman Long
  Cc: Dave Chinner, James Bottomley, Linus Torvalds, Matthew Wilcox,
	Al Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	Linux Kernel Mailing List, linux-fsdevel, linux-mm,
	open list:DOCUMENTATION, Jan Kara, Paul McKenney, Andrew Morton,
	Ingo Molnar, Miklos Szeredi, Larry Woodman, Wangkai (Kevin,C)
On Wed 18-07-18 12:17:24, Waiman Long wrote:
> On 07/16/2018 05:09 AM, Michal Hocko wrote:
> > On Fri 13-07-18 10:36:14, Dave Chinner wrote:
> > [...]
> >> By limiting the number of negative dentries in this case, internal
> >> slab fragmentation is reduced such that reclaim cost never gets out
> >> of control. While it appears to "fix" the symptoms, it doesn't
> >> address the underlying problem. It is a partial solution at best but
> >> at worst it's another opaque knob that nobody knows how or when to
> >> tune.
> > Would it help to put all the negative dentries into its own slab cache?
> >
> >> Very few microbenchmarks expose this internal slab fragmentation
> >> problem because they either don't run long enough, don't create
> >> memory pressure, or don't have access patterns that mix long and
> >> short term slab objects together in a way that causes slab
> >> fragmentation. Run some cold cache directory traversals (git
> >> status?) at the same time you are creating negative dentries so you
> >> create pinned partial pages in the slab cache and see how the
> >> behaviour changes....
> > Agreed! Slab fragmentation is a real problem we are seeing for quite
> > some time. We should try to address it rather than paper over it with
> > weird knobs.
> 
> I am aware that you don't like the limit knob that control how many
> negative dentries are allowed as a percentage of total system memory. I
> got comments in the past about doing some kind of auto-tuning. How about
> consolidating the 2 knobs that I currently have in the patchset into a
> single one with 3 possible values, like:
> 
> 0 - no limiting
> 1 - set soft limit to "a constant + 4 x max # of positive dentries" and
> warn if exceeded
> 2 - same limit but kill excess negative dentries after use.
> 
> Does that kind of knob make more sense to you?
Not really. See the pagecache limit story in http://lkml.kernel.org/r/20180719084538.GP7193@dhcp22.suse.cz
I might be overly sensitive but I got burnt a lot in the past. We should
strive to make the reclaim seamless without asking admins to do our job.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread 
 
 
 
 
 
 
 
 
 
 
 
 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-11 15:13           ` Waiman Long
  2018-07-11 17:42             ` James Bottomley
@ 2018-07-12  8:48             ` Michal Hocko
  2018-07-12 16:12               ` Waiman Long
  1 sibling, 1 reply; 49+ messages in thread
From: Michal Hocko @ 2018-07-12  8:48 UTC (permalink / raw)
  To: Waiman Long
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On Wed 11-07-18 11:13:58, Waiman Long wrote:
> On 07/11/2018 06:21 AM, Michal Hocko wrote:
> > On Tue 10-07-18 12:09:17, Waiman Long wrote:
> >> On 07/10/2018 10:27 AM, Michal Hocko wrote:
> >>> On Mon 09-07-18 12:01:04, Waiman Long wrote:
> >>>> On 07/09/2018 04:19 AM, Michal Hocko wrote:
> > [...]
> >>>>> percentage has turned out to be a really wrong unit for many tunables
> >>>>> over time. Even 1% can be just too much on really large machines.
> >>>> Yes, that is true. Do you have any suggestion of what kind of unit
> >>>> should be used? I can scale down the unit to 0.1% of the system memory.
> >>>> Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
> >>>> corresponds to 200k, etc.
> >>> I simply think this is a strange user interface. How much is a
> >>> reasonable number? How can any admin figure that out?
> >> Without the optional enforcement, the limit is essentially just a
> >> notification mechanism where the system signals that there is something
> >> wrong going on and the system administrator need to take a look. So it
> >> is perfectly OK if the limit is sufficiently high that normally we won't
> >> need to use that many negative dentries. The goal is to prevent negative
> >> dentries from consuming a significant portion of the system memory.
> > So again. How do you tell the right number?
> 
> I guess it will be more a trial and error kind of adjustment as the
> right figure will depend on the kind of workloads being run on the
> system. So unless the enforcement option is turned on, setting a limit
> that is too small won't have too much impact over than a slight
> performance drop because of the invocation of the slowpaths and the
> warning messages in the console. Whenever a non-zero value is written
> into "neg-dentry-limit", an informational message will be printed about
> what the actual negative dentry limits
> will be. It can be compared against the current negative dentry number
> (5th number) from "dentry-state" to see if there is enough safe margin
> to avoid false positive warning.
What you wrote above is exactly the reason why I do not like yet another
tunable. If you cannot give a reasonable cook book on how to tune this
properly then nobody will really use it and we will eventually find
out that we have a user visible API which might simply make further
development harder and which will be hard to get rid of because you
never know who is going to use it for strange purposes.
Really, negative entries are a cache and if we do not shrink that cache
properly then this should be fixed rather than giving up and pretending
that the admin is the one to control that.
-- 
Michal Hocko
SUSE Labs
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12  8:48             ` Michal Hocko
@ 2018-07-12 16:12               ` Waiman Long
  2018-07-12 23:16                 ` Andrew Morton
  0 siblings, 1 reply; 49+ messages in thread
From: Waiman Long @ 2018-07-12 16:12 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Alexander Viro, Jonathan Corbet, Luis R. Rodriguez, Kees Cook,
	linux-kernel, linux-fsdevel, linux-mm, linux-doc, Linus Torvalds,
	Jan Kara, Paul E. McKenney, Andrew Morton, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On 07/12/2018 04:48 AM, Michal Hocko wrote:
> On Wed 11-07-18 11:13:58, Waiman Long wrote:
>> On 07/11/2018 06:21 AM, Michal Hocko wrote:
>>> On Tue 10-07-18 12:09:17, Waiman Long wrote:
>>>> On 07/10/2018 10:27 AM, Michal Hocko wrote:
>>>>> On Mon 09-07-18 12:01:04, Waiman Long wrote:
>>>>>> On 07/09/2018 04:19 AM, Michal Hocko wrote:
>>> [...]
>>>>>>> percentage has turned out to be a really wrong unit for many tunables
>>>>>>> over time. Even 1% can be just too much on really large machines.
>>>>>> Yes, that is true. Do you have any suggestion of what kind of unit
>>>>>> should be used? I can scale down the unit to 0.1% of the system memory.
>>>>>> Alternatively, one unit can be 10k/cpu thread, so a 20-thread system
>>>>>> corresponds to 200k, etc.
>>>>> I simply think this is a strange user interface. How much is a
>>>>> reasonable number? How can any admin figure that out?
>>>> Without the optional enforcement, the limit is essentially just a
>>>> notification mechanism where the system signals that there is something
>>>> wrong going on and the system administrator need to take a look. So it
>>>> is perfectly OK if the limit is sufficiently high that normally we won't
>>>> need to use that many negative dentries. The goal is to prevent negative
>>>> dentries from consuming a significant portion of the system memory.
>>> So again. How do you tell the right number?
>> I guess it will be more a trial and error kind of adjustment as the
>> right figure will depend on the kind of workloads being run on the
>> system. So unless the enforcement option is turned on, setting a limit
>> that is too small won't have too much impact over than a slight
>> performance drop because of the invocation of the slowpaths and the
>> warning messages in the console. Whenever a non-zero value is written
>> into "neg-dentry-limit", an informational message will be printed about
>> what the actual negative dentry limits
>> will be. It can be compared against the current negative dentry number
>> (5th number) from "dentry-state" to see if there is enough safe margin
>> to avoid false positive warning.
> What you wrote above is exactly the reason why I do not like yet another
> tunable. If you cannot give a reasonable cook book on how to tune this
> properly then nobody will really use it and we will eventually find
> out that we have a user visible API which might simply make further
> development harder and which will be hard to get rid of because you
> never know who is going to use it for strange purposes.
>
> Really, negative entries are a cache and if we do not shrink that cache
> properly then this should be fixed rather than giving up and pretending
> that the admin is the one to control that.
The rationale beside this patchset comes from a customer request to have
the ability to track and limit negative dentries. The goal is to not
have a disproportionate amount of memory being consumed by negative
dentries. Setting the limit and reaching it does nothing other than
gives out a warning about the limit being breached unless the enforce
option is turned on which is for the paranoids.
There is no one right value for the limit. It all depends on what the
users think is a disproportionate amount of memory.
Cheers,
Longman
^ permalink raw reply	[flat|nested] 49+ messages in thread 
- * Re: [PATCH v6 0/7] fs/dcache: Track & limit # of negative dentries
  2018-07-12 16:12               ` Waiman Long
@ 2018-07-12 23:16                 ` Andrew Morton
  0 siblings, 0 replies; 49+ messages in thread
From: Andrew Morton @ 2018-07-12 23:16 UTC (permalink / raw)
  To: Waiman Long
  Cc: Michal Hocko, Alexander Viro, Jonathan Corbet, Luis R. Rodriguez,
	Kees Cook, linux-kernel, linux-fsdevel, linux-mm, linux-doc,
	Linus Torvalds, Jan Kara, Paul E. McKenney, Ingo Molnar,
	Miklos Szeredi, Matthew Wilcox, Larry Woodman, James Bottomley,
	Wangkai (Kevin C)
On Thu, 12 Jul 2018 12:12:28 -0400 Waiman Long <longman@redhat.com> wrote:
> The rationale beside this patchset comes from a customer request to have
> the ability to track and limit negative dentries. 
Please go back to customer and ask them "why", then let us know.
Could I suggest you stop working on implementation things and instead
work on preparing a comprehensive bug report?  Describe the workload,
describe the system behavior, describe why it is problematic, describe
the preferred behavior, etc.
Once we have that understanding, it might be that we eventually agree
that the problem is unfixable using existing memory management
techniques and that it is indeed appropriate that we add a lot more
code which essentially duplicates kswapd functionality and which
essentially duplicates direct reclaim functionality.  But I sure hope
not.
^ permalink raw reply	[flat|nested] 49+ messages in thread