diff for duplicates of <20170918163434.GA11236@cmpxchg.org>

All of lore.kernel.org
 help / color / mirror / Atom feed

diff for duplicates of <20170918163434.GA11236@cmpxchg.org>

diff --git a/a/2.txt b/N1/2.txt
index 8b13789..e97ac5e 100644
--- a/a/2.txt
+++ b/N1/2.txt
@@ -1 +1,114 @@
+>From d5ffeb4d9d65fcff1b7e50dbde8264b4c32824a5 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Wed, 14 Jun 2017 11:12:05 -0400
+Subject: [PATCH 1/3] sched/loadavg: consolidate LOAD_INT, LOAD_FRAC macros
 
+There are several identical definitions of those macros in places that
+mess with fixed-point load averages. Provide an official version.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+---
+ arch/powerpc/platforms/cell/spufs/sched.c | 3 ---
+ arch/s390/appldata/appldata_os.c          | 4 ----
+ drivers/cpuidle/governors/menu.c          | 4 ----
+ fs/proc/loadavg.c                         | 3 ---
+ include/linux/sched/loadavg.h             | 3 +++
+ kernel/debug/kdb/kdb_main.c               | 7 +------
+ 6 files changed, 4 insertions(+), 20 deletions(-)
+
+diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
+index 1fbb5da17dd2..de544070def3 100644
+--- a/arch/powerpc/platforms/cell/spufs/sched.c
++++ b/arch/powerpc/platforms/cell/spufs/sched.c
+@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx,
+ 	}
+ }
+ 
+-#define LOAD_INT(x) ((x) >> FSHIFT)
+-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+-
+ static int show_spu_loadavg(struct seq_file *s, void *private)
+ {
+ 	int a, b, c;
+diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
+index 45b3178200ab..a8aac17e1e82 100644
+--- a/arch/s390/appldata/appldata_os.c
++++ b/arch/s390/appldata/appldata_os.c
+@@ -24,10 +24,6 @@
+ 
+ #include "appldata.h"
+ 
+-
+-#define LOAD_INT(x) ((x) >> FSHIFT)
+-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+-
+ /*
+  * OS data
+  *
+diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
+index 61b64c2b2cb8..e215a2c10a61 100644
+--- a/drivers/cpuidle/governors/menu.c
++++ b/drivers/cpuidle/governors/menu.c
+@@ -132,10 +132,6 @@ struct menu_device {
+ 	int		interval_ptr;
+ };
+ 
+-
+-#define LOAD_INT(x) ((x) >> FSHIFT)
+-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+-
+ static inline int get_loadavg(unsigned long load)
+ {
+ 	return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
+diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
+index 983fce5c2418..111a25e4b088 100644
+--- a/fs/proc/loadavg.c
++++ b/fs/proc/loadavg.c
+@@ -9,9 +9,6 @@
+ #include <linux/seqlock.h>
+ #include <linux/time.h>
+ 
+-#define LOAD_INT(x) ((x) >> FSHIFT)
+-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+-
+ static int loadavg_proc_show(struct seq_file *m, void *v)
+ {
+ 	unsigned long avnrun[3];
+diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
+index 4264bc6b2c27..745483bb5cca 100644
+--- a/include/linux/sched/loadavg.h
++++ b/include/linux/sched/loadavg.h
+@@ -26,6 +26,9 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+ 	load += n*(FIXED_1-exp); \
+ 	load >>= FSHIFT;
+ 
++#define LOAD_INT(x) ((x) >> FSHIFT)
++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
++
+ extern void calc_global_load(unsigned long ticks);
+ 
+ #endif /* _LINUX_SCHED_LOADAVG_H */
+diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
+index c8146d53ca67..2dddd25ccd7a 100644
+--- a/kernel/debug/kdb/kdb_main.c
++++ b/kernel/debug/kdb/kdb_main.c
+@@ -2571,16 +2571,11 @@ static int kdb_summary(int argc, const char **argv)
+ 	}
+ 	kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
+ 
+-	/* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
+-
+-#define LOAD_INT(x) ((x) >> FSHIFT)
+-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+ 	kdb_printf("load avg   %ld.%02ld %ld.%02ld %ld.%02ld\n",
+ 		LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
+ 		LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
+ 		LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
+-#undef LOAD_INT
+-#undef LOAD_FRAC
++
+ 	/* Display in kilobytes */
+ #define K(x) ((x) << (PAGE_SHIFT - 10))
+ 	kdb_printf("\nMemTotal:       %8lu kB\nMemFree:        %8lu kB\n"
+-- 
+2.14.1
diff --git a/N1/3.hdr b/N1/3.hdr
new file mode 100644
index 0000000..c0eac16
--- /dev/null
+++ b/N1/3.hdr
@@ -0,0 +1,2 @@
+Content-Type: text/x-diff; charset=us-ascii
+Content-Disposition: attachment; filename="0002-mm-workingset-tell-cache-transitions-from-workingset.patch"
diff --git a/N1/3.txt b/N1/3.txt
new file mode 100644
index 0000000..8419633
--- /dev/null
+++ b/N1/3.txt
@@ -0,0 +1,407 @@
+>From 4ccc6444efbdcc30680eff6b8f345511c306f3d7 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 2 Mar 2017 09:58:03 -0500
+Subject: [PATCH 2/3] mm: workingset: tell cache transitions from workingset
+ thrashing
+
+Refaults happen during transitions between workingsets as well as
+in-place thrashing. Knowing the difference between the two has a range
+of applications, including measuring the impact of memory shortage on
+the system performance, as well as the ability to balance pressure
+more smartly between the filesystem cache and the swap-backed workingset.
+
+During workingset transitions, inactive cache refaults and pushes out
+established active cache. When that active cache isn't stale, however,
+and also ends up refaulting, that's bona fide thrashing.
+
+Introduce a new page flag that tells on eviction whether the page has
+been active or not in its lifetime. This bit is then stored in the
+shadow entry, to classify refaults as transitioning or thrashing.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+---
+ include/linux/mmzone.h         |  1 +
+ include/linux/page-flags.h     |  5 ++-
+ include/linux/swap.h           |  2 +-
+ include/trace/events/mmflags.h |  1 +
+ mm/filemap.c                   |  9 ++--
+ mm/huge_memory.c               |  1 +
+ mm/memcontrol.c                |  2 +
+ mm/migrate.c                   |  2 +
+ mm/swap_state.c                |  1 +
+ mm/vmscan.c                    |  1 +
+ mm/vmstat.c                    |  1 +
+ mm/workingset.c                | 96 +++++++++++++++++++++++++++---------------
+ 12 files changed, 79 insertions(+), 43 deletions(-)
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index fc14b8b3f6ce..b8726b501166 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -156,6 +156,7 @@ enum node_stat_item {
+ 	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
+ 	WORKINGSET_REFAULT,
+ 	WORKINGSET_ACTIVATE,
++	WORKINGSET_RESTORE,
+ 	WORKINGSET_NODERECLAIM,
+ 	NR_ANON_MAPPED,	/* Mapped anonymous pages */
+ 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index d33e3280c8ad..f889af1a6aed 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -73,13 +73,14 @@
+  */
+ enum pageflags {
+ 	PG_locked,		/* Page is locked. Don't touch. */
+-	PG_error,
+ 	PG_referenced,
+ 	PG_uptodate,
+ 	PG_dirty,
+ 	PG_lru,
+ 	PG_active,
++	PG_workingset,
+ 	PG_waiters,		/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
++	PG_error,
+ 	PG_slab,
+ 	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
+ 	PG_arch_1,
+@@ -272,6 +273,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
+ PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
+ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
+ 	TESTCLEARFLAG(Active, active, PF_HEAD)
++PAGEFLAG(Workingset, workingset, PF_HEAD)
++	TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
+ __PAGEFLAG(Slab, slab, PF_NO_TAIL)
+ __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)
+ PAGEFLAG(Checked, checked, PF_NO_COMPOUND)	   /* Used by some filesystems */
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index d83d28e53e62..914a173beee1 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -252,7 +252,7 @@ struct swap_info_struct {
+ 
+ /* linux/mm/workingset.c */
+ void *workingset_eviction(struct address_space *mapping, struct page *page);
+-bool workingset_refault(void *shadow);
++void workingset_refault(struct page *page, void *shadow);
+ void workingset_activation(struct page *page);
+ void workingset_update_node(struct radix_tree_node *node, void *private);
+ 
+diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
+index 8e50d01c645f..aac9eb272754 100644
+--- a/include/trace/events/mmflags.h
++++ b/include/trace/events/mmflags.h
+@@ -90,6 +90,7 @@
+ 	{1UL << PG_dirty,		"dirty"		},		\
+ 	{1UL << PG_lru,			"lru"		},		\
+ 	{1UL << PG_active,		"active"	},		\
++	{1UL << PG_workingset,		"workingset"	},		\
+ 	{1UL << PG_slab,		"slab"		},		\
+ 	{1UL << PG_owner_priv_1,	"owner_priv_1"	},		\
+ 	{1UL << PG_arch_1,		"arch_1"	},		\
+diff --git a/mm/filemap.c b/mm/filemap.c
+index 65b4b6e7f7bd..da55a5693da9 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -823,12 +823,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ 		 * data from the working set, only to cache data that will
+ 		 * get overwritten with something else, is a waste of memory.
+ 		 */
+-		if (!(gfp_mask & __GFP_WRITE) &&
+-		    shadow && workingset_refault(shadow)) {
+-			SetPageActive(page);
+-			workingset_activation(page);
+-		} else
+-			ClearPageActive(page);
++		WARN_ON_ONCE(PageActive(page));
++		if (!(gfp_mask & __GFP_WRITE) && shadow)
++			workingset_refault(page, shadow);
+ 		lru_cache_add(page);
+ 	}
+ 	return ret;
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 90731e3b7e58..b18ac8084c2a 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2239,6 +2239,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
+ 			 (1L << PG_mlocked) |
+ 			 (1L << PG_uptodate) |
+ 			 (1L << PG_active) |
++			 (1L << PG_workingset) |
+ 			 (1L << PG_locked) |
+ 			 (1L << PG_unevictable) |
+ 			 (1L << PG_dirty)));
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c
+index e09741af816f..93b2eb063afd 100644
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -5274,6 +5274,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
+ 		   stat[WORKINGSET_REFAULT]);
+ 	seq_printf(m, "workingset_activate %lu\n",
+ 		   stat[WORKINGSET_ACTIVATE]);
++	seq_printf(m, "workingset_restore %lu\n",
++		   stat[WORKINGSET_RESTORE]);
+ 	seq_printf(m, "workingset_nodereclaim %lu\n",
+ 		   stat[WORKINGSET_NODERECLAIM]);
+ 
+diff --git a/mm/migrate.c b/mm/migrate.c
+index e84eeb4e4356..48f4a79869ce 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -624,6 +624,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
+ 		SetPageActive(newpage);
+ 	} else if (TestClearPageUnevictable(page))
+ 		SetPageUnevictable(newpage);
++	if (PageWorkingset(page))
++		SetPageWorkingset(newpage);
+ 	if (PageChecked(page))
+ 		SetPageChecked(newpage);
+ 	if (PageMappedToDisk(page))
+diff --git a/mm/swap_state.c b/mm/swap_state.c
+index b68c93014f50..b39b3969be07 100644
+--- a/mm/swap_state.c
++++ b/mm/swap_state.c
+@@ -387,6 +387,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ 			/*
+ 			 * Initiate read into locked page and return.
+ 			 */
++			SetPageWorkingset(new_page);
+ 			lru_cache_add_anon(new_page);
+ 			*new_page_allocated = true;
+ 			return new_page;
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index a1af041930a6..60357cd84c67 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2022,6 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
+ 		}
+ 
+ 		ClearPageActive(page);	/* we are de-activating */
++		SetPageWorkingset(page);
+ 		list_add(&page->lru, &l_inactive);
+ 	}
+ 
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 9a4441bbeef2..87ce53498828 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -956,6 +956,7 @@ const char * const vmstat_text[] = {
+ 	"nr_isolated_file",
+ 	"workingset_refault",
+ 	"workingset_activate",
++	"workingset_restore",
+ 	"workingset_nodereclaim",
+ 	"nr_anon_pages",
+ 	"nr_mapped",
+diff --git a/mm/workingset.c b/mm/workingset.c
+index 7119cd745ace..264f0498f2bc 100644
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -120,7 +120,7 @@
+  * the only thing eating into inactive list space is active pages.
+  *
+  *
+- *		Activating refaulting pages
++ *		Refaulting inactive pages
+  *
+  * All that is known about the active list is that the pages have been
+  * accessed more than once in the past.  This means that at any given
+@@ -133,6 +133,10 @@
+  * used less frequently than the refaulting page - or even not used at
+  * all anymore.
+  *
++ * That means if inactive cache is refaulting with a suitable refault
++ * distance, we assume the cache workingset is transitioning and put
++ * pressure on the current active list.
++ *
+  * If this is wrong and demotion kicks in, the pages which are truly
+  * used more frequently will be reactivated while the less frequently
+  * used once will be evicted from memory.
+@@ -140,6 +144,14 @@
+  * But if this is right, the stale pages will be pushed out of memory
+  * and the used pages get to stay in cache.
+  *
++ *		Refaulting active pages
++ *
++ * If on the other hand the refaulting pages have recently been
++ * deactivated, it means that the active list is no longer protecting
++ * actively used cache from reclaim. The cache is NOT transitioning to
++ * a different workingset; the existing workingset is thrashing in the
++ * space allocated to the page cache.
++ *
+  *
+  *		Implementation
+  *
+@@ -155,8 +167,7 @@
+  */
+ 
+ #define EVICTION_SHIFT	(RADIX_TREE_EXCEPTIONAL_ENTRY + \
+-			 NODES_SHIFT +	\
+-			 MEM_CGROUP_ID_SHIFT)
++			 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+ #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
+ 
+ /*
+@@ -169,23 +180,28 @@
+  */
+ static unsigned int bucket_order __read_mostly;
+ 
+-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
++static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
++			 bool workingset)
+ {
+ 	eviction >>= bucket_order;
+ 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
+ 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
++	eviction = (eviction << 1) | workingset;
+ 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
+ 
+ 	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ }
+ 
+ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
+-			  unsigned long *evictionp)
++			  unsigned long *evictionp, bool *workingsetp)
+ {
+ 	unsigned long entry = (unsigned long)shadow;
+ 	int memcgid, nid;
++	bool workingset;
+ 
+ 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
++	workingset = entry & 1;
++	entry >>= 1;
+ 	nid = entry & ((1UL << NODES_SHIFT) - 1);
+ 	entry >>= NODES_SHIFT;
+ 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+@@ -194,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
+ 	*memcgidp = memcgid;
+ 	*pgdat = NODE_DATA(nid);
+ 	*evictionp = entry << bucket_order;
++	*workingsetp = workingset;
+ }
+ 
+ /**
+@@ -206,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
+  */
+ void *workingset_eviction(struct address_space *mapping, struct page *page)
+ {
+-	struct mem_cgroup *memcg = page_memcg(page);
+ 	struct pglist_data *pgdat = page_pgdat(page);
++	struct mem_cgroup *memcg = page_memcg(page);
+ 	int memcgid = mem_cgroup_id(memcg);
+ 	unsigned long eviction;
+ 	struct lruvec *lruvec;
+@@ -219,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
+ 
+ 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ 	eviction = atomic_long_inc_return(&lruvec->inactive_age);
+-	return pack_shadow(memcgid, pgdat, eviction);
++	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ }
+ 
+ /**
+  * workingset_refault - evaluate the refault of a previously evicted page
++ * @page: the freshly allocated replacement page
+  * @shadow: shadow entry of the evicted page
+  *
+  * Calculates and evaluates the refault distance of the previously
+  * evicted page in the context of the node it was allocated in.
+- *
+- * Returns %true if the page should be activated, %false otherwise.
+  */
+-bool workingset_refault(void *shadow)
++void workingset_refault(struct page *page, void *shadow)
+ {
+ 	unsigned long refault_distance;
++	struct pglist_data *pgdat;
+ 	unsigned long active_file;
+ 	struct mem_cgroup *memcg;
+ 	unsigned long eviction;
+ 	struct lruvec *lruvec;
+ 	unsigned long refault;
+-	struct pglist_data *pgdat;
++	bool workingset;
+ 	int memcgid;
+ 
+-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
++	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ 
+ 	rcu_read_lock();
+ 	/*
+@@ -262,41 +279,50 @@ bool workingset_refault(void *shadow)
+ 	 * configurations instead.
+ 	 */
+ 	memcg = mem_cgroup_from_id(memcgid);
+-	if (!mem_cgroup_disabled() && !memcg) {
+-		rcu_read_unlock();
+-		return false;
+-	}
++	if (!mem_cgroup_disabled() && !memcg)
++		goto out;
+ 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ 	refault = atomic_long_read(&lruvec->inactive_age);
+ 	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
+ 
+ 	/*
+-	 * The unsigned subtraction here gives an accurate distance
+-	 * across inactive_age overflows in most cases.
++	 * Calculate the refault distance
+ 	 *
+-	 * There is a special case: usually, shadow entries have a
+-	 * short lifetime and are either refaulted or reclaimed along
+-	 * with the inode before they get too old.  But it is not
+-	 * impossible for the inactive_age to lap a shadow entry in
+-	 * the field, which can then can result in a false small
+-	 * refault distance, leading to a false activation should this
+-	 * old entry actually refault again.  However, earlier kernels
+-	 * used to deactivate unconditionally with *every* reclaim
+-	 * invocation for the longest time, so the occasional
+-	 * inappropriate activation leading to pressure on the active
+-	 * list is not a problem.
++	 * The unsigned subtraction here gives an accurate distance
++	 * across inactive_age overflows in most cases. There is a
++	 * special case: usually, shadow entries have a short lifetime
++	 * and are either refaulted or reclaimed along with the inode
++	 * before they get too old.  But it is not impossible for the
++	 * inactive_age to lap a shadow entry in the field, which can
++	 * then result in a false small refault distance, leading
++	 * to a false activation should this old entry actually
++	 * refault again.  However, earlier kernels used to deactivate
++	 * unconditionally with *every* reclaim invocation for the
++	 * longest time, so the occasional inappropriate activation
++	 * leading to pressure on the active list is not a problem.
+ 	 */
+ 	refault_distance = (refault - eviction) & EVICTION_MASK;
+ 
+ 	inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
+ 
+-	if (refault_distance <= active_file) {
+-		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+-		rcu_read_unlock();
+-		return true;
+-	}
++	/*
++	 * Compare the distance to the existing workingset size. We
++	 * don't act on pages that couldn't stay resident even if all
++	 * the memory was available to the page cache.
++	 */
++	if (refault_distance > active_file)
++		goto out;
++
++	SetPageActive(page);
++	SetPageWorkingset(page);
++	atomic_long_inc(&lruvec->inactive_age);
++	inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
++
++	/* Page was active prior to eviction */
++	if (workingset)
++		inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
++out:
+ 	rcu_read_unlock();
+-	return false;
+ }
+ 
+ /**
+-- 
+2.14.1
diff --git a/N1/4.hdr b/N1/4.hdr
new file mode 100644
index 0000000..59b4563
--- /dev/null
+++ b/N1/4.hdr
@@ -0,0 +1,2 @@
+Content-Type: text/x-diff; charset=us-ascii
+Content-Disposition: attachment; filename="0003-mm-sched-memdelay-memory-health-interface-for-system.patch"
diff --git a/N1/4.txt b/N1/4.txt
new file mode 100644
index 0000000..a700eaf
--- /dev/null
+++ b/N1/4.txt
@@ -0,0 +1,1209 @@
+>From c3e97f5daf99bcd54383eaab466c477dbb743dd9 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Mon, 5 Jun 2017 16:07:22 -0400
+Subject: [PATCH 3/3] mm/sched: memdelay: memory health interface for systems
+ and workloads
+
+Linux doesn't have a useful metric to describe the memory health of a
+system, a cgroup container, or individual tasks.
+
+When workloads are bigger than available memory, they spend a certain
+amount of their time inside page reclaim, waiting on thrashing cache,
+and swapping in. This has an impact on latency, and depending on the CPU
+capacity in the system can also translate to a decrease in throughput.
+
+While Linux exports some stats and counters for these events, it does
+not quantify the true impact they have on throughput and latency. How
+much of the execution time is spent unproductively? This is important
+to know when sizing workloads to systems and containers. It also comes
+in handy when evaluating the effectiveness and efficiency of the
+kernel's memory management policies and heuristics.
+
+This patch implements a metric that quantifies memory pressure in a
+unit that matters most to applications and does not rely on hardware
+aspects to be meaningful: wallclock time lost while waiting on memory.
+
+Whenever a task is blocked on refaults, swapins, or direct reclaim,
+the time it spends is accounted on the task level and aggregated into
+a domain state along with other tasks on the system and cgroup level.
+
+Each task has a /proc/<pid>/memdelay file that lists the microseconds
+the task has been delayed since it's been forked. That file can be
+sampled periodically for recent delays, or before and after certain
+operations to measure their memory-related latencies.
+
+On the system and cgroup-level, there are /proc/memdelay and
+memory.memdelay, respectively, and their format is as such:
+
+$ cat /proc/memdelay
+2489084
+41.61 47.28 29.66
+0.00 0.00 0.00
+
+The first line shows the cumulative delay times of all tasks in the
+domain - in this case, all tasks in the system cumulatively lost 2.49
+seconds due to memory delays.
+
+The second and third lines show percentages spent in aggregate states
+for the domain - system or cgroup - in a load average type format as
+decaying averages over the last 1m, 5m, and 15m:
+
+The second line indicates the share of wall-time the domain spends in
+a state where SOME tasks are delayed by memory while others are still
+productive (runnable or iowait). This indicates a latency problem for
+individual tasks, but since the CPU/IO capacity is still used, adding
+more memory might not necessarily improve the domain's throughput.
+
+The third line indicates the share of wall-time the domain spends in a
+state where ALL non-idle tasks are delayed by memory. In this state,
+the domain is entirely unproductive due to a lack of memory.
+
+v2:
+- fix active-delay condition when only other runnables, no iowait
+- drop private lock from sched path, we can use the rq lock
+- fix refault vs. simple lockwait detection
+- drop ktime, we can use cpu_clock()
+
+XXX:
+- eliminate redundant cgroup hierarchy walks in the scheduler
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+---
+ fs/proc/array.c            |   8 ++
+ fs/proc/base.c             |   2 +
+ fs/proc/internal.h         |   2 +
+ include/linux/memcontrol.h |  14 +++
+ include/linux/memdelay.h   | 182 +++++++++++++++++++++++++++++
+ include/linux/sched.h      |   8 ++
+ kernel/cgroup/cgroup.c     |   3 +-
+ kernel/fork.c              |   4 +
+ kernel/sched/Makefile      |   2 +-
+ kernel/sched/core.c        |  27 +++++
+ kernel/sched/memdelay.c    | 118 +++++++++++++++++++
+ mm/Makefile                |   2 +-
+ mm/compaction.c            |   4 +
+ mm/filemap.c               |  11 ++
+ mm/memcontrol.c            |  25 ++++
+ mm/memdelay.c              | 285 +++++++++++++++++++++++++++++++++++++++++++++
+ mm/page_alloc.c            |  11 +-
+ mm/vmscan.c                |   9 ++
+ 18 files changed, 712 insertions(+), 5 deletions(-)
+ create mode 100644 include/linux/memdelay.h
+ create mode 100644 kernel/sched/memdelay.c
+ create mode 100644 mm/memdelay.c
+
+diff --git a/fs/proc/array.c b/fs/proc/array.c
+index 88c355574aa0..00e0e9aa3e70 100644
+--- a/fs/proc/array.c
++++ b/fs/proc/array.c
+@@ -611,6 +611,14 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
+ 	return 0;
+ }
+ 
++int proc_pid_memdelay(struct seq_file *m, struct pid_namespace *ns,
++		      struct pid *pid, struct task_struct *task)
++{
++	seq_put_decimal_ull(m, "", task->memdelay_total);
++	seq_putc(m, '\n');
++	return 0;
++}
++
+ #ifdef CONFIG_PROC_CHILDREN
+ static struct pid *
+ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
+diff --git a/fs/proc/base.c b/fs/proc/base.c
+index 719c2e943ea1..19f194940c80 100644
+--- a/fs/proc/base.c
++++ b/fs/proc/base.c
+@@ -2916,6 +2916,7 @@ static const struct pid_entry tgid_base_stuff[] = {
+ 	REG("cmdline",    S_IRUGO, proc_pid_cmdline_ops),
+ 	ONE("stat",       S_IRUGO, proc_tgid_stat),
+ 	ONE("statm",      S_IRUGO, proc_pid_statm),
++	ONE("memdelay",   S_IRUGO, proc_pid_memdelay),
+ 	REG("maps",       S_IRUGO, proc_pid_maps_operations),
+ #ifdef CONFIG_NUMA
+ 	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
+@@ -3307,6 +3308,7 @@ static const struct pid_entry tid_base_stuff[] = {
+ 	REG("cmdline",   S_IRUGO, proc_pid_cmdline_ops),
+ 	ONE("stat",      S_IRUGO, proc_tid_stat),
+ 	ONE("statm",     S_IRUGO, proc_pid_statm),
++	ONE("memdelay",  S_IRUGO, proc_pid_memdelay),
+ 	REG("maps",      S_IRUGO, proc_tid_maps_operations),
+ #ifdef CONFIG_PROC_CHILDREN
+ 	REG("children",  S_IRUGO, proc_tid_children_operations),
+diff --git a/fs/proc/internal.h b/fs/proc/internal.h
+index aa2b89071630..7ab706c316b8 100644
+--- a/fs/proc/internal.h
++++ b/fs/proc/internal.h
+@@ -146,6 +146,8 @@ extern int proc_pid_status(struct seq_file *, struct pid_namespace *,
+ 			   struct pid *, struct task_struct *);
+ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
+ 			  struct pid *, struct task_struct *);
++extern int proc_pid_memdelay(struct seq_file *, struct pid_namespace *,
++			     struct pid *, struct task_struct *);
+ 
+ /*
+  * base.c
+diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
+index 9b15a4bcfa77..1f720d3090f7 100644
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -30,6 +30,7 @@
+ #include <linux/vmstat.h>
+ #include <linux/writeback.h>
+ #include <linux/page-flags.h>
++#include <linux/memdelay.h>
+ 
+ struct mem_cgroup;
+ struct page;
+@@ -183,6 +184,9 @@ struct mem_cgroup {
+ 
+ 	unsigned long soft_limit;
+ 
++	/* Memory delay measurement domain */
++	struct memdelay_domain *memdelay_domain;
++
+ 	/* vmpressure notifications */
+ 	struct vmpressure vmpressure;
+ 
+@@ -728,6 +732,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
+ 	return &pgdat->lruvec;
+ }
+ 
++static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
++{
++	return NULL;
++}
++
+ static inline bool mm_match_cgroup(struct mm_struct *mm,
+ 		struct mem_cgroup *memcg)
+ {
+@@ -740,6 +749,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
+ 	return true;
+ }
+ 
++static inline struct mem_cgroup *mem_cgroup_from_task(struct task_struct *task)
++{
++	return NULL;
++}
++
+ static inline struct mem_cgroup *
+ mem_cgroup_iter(struct mem_cgroup *root,
+ 		struct mem_cgroup *prev,
+diff --git a/include/linux/memdelay.h b/include/linux/memdelay.h
+new file mode 100644
+index 000000000000..08ed4e4baedf
+--- /dev/null
++++ b/include/linux/memdelay.h
+@@ -0,0 +1,182 @@
++#ifndef _LINUX_MEMDELAY_H
++#define _LINUX_MEMDELAY_H
++
++#include <linux/spinlock_types.h>
++#include <linux/sched.h>
++
++struct seq_file;
++struct css_set;
++
++/*
++ * Task productivity states tracked by the scheduler
++ */
++enum memdelay_task_state {
++	MTS_NONE,		/* Idle/unqueued/untracked */
++	MTS_IOWAIT,		/* Waiting for IO, not memory delayed */
++	MTS_RUNNABLE,		/* On the runqueue, not memory delayed */
++	MTS_DELAYED,		/* Memory delayed, not running */
++	MTS_DELAYED_ACTIVE,	/* Memory delayed, actively running */
++	NR_MEMDELAY_TASK_STATES,
++};
++
++/*
++ * System/cgroup delay state tracked by the VM, composed of the
++ * productivity states of all tasks inside the domain.
++ */
++enum memdelay_domain_state {
++	MDS_NONE,		/* No delayed tasks */
++	MDS_SOME,		/* Delayed tasks, working tasks */
++	MDS_FULL,		/* Delayed tasks, no working tasks */
++	NR_MEMDELAY_DOMAIN_STATES,
++};
++
++struct memdelay_domain_cpu {
++	/* Task states of the domain on this CPU */
++	int tasks[NR_MEMDELAY_TASK_STATES];
++
++	/* Delay state of the domain on this CPU */
++	enum memdelay_domain_state state;
++
++	/* Time of last state change */
++	u64 state_start;
++};
++
++struct memdelay_domain {
++	/* Aggregate delayed time of all domain tasks */
++	unsigned long aggregate;
++
++	/* Per-CPU delay states in the domain */
++	struct memdelay_domain_cpu __percpu *mdcs;
++
++	/* Cumulative state times from all CPUs */
++	unsigned long times[NR_MEMDELAY_DOMAIN_STATES];
++
++	/* Decaying state time averages over 1m, 5m, 15m */
++	unsigned long period_expires;
++	unsigned long avg_full[3];
++	unsigned long avg_some[3];
++};
++
++/* mm/memdelay.c */
++extern struct memdelay_domain memdelay_global_domain;
++void memdelay_init(void);
++void memdelay_task_change(struct task_struct *task,
++			  enum memdelay_task_state old,
++			  enum memdelay_task_state new);
++struct memdelay_domain *memdelay_domain_alloc(void);
++void memdelay_domain_free(struct memdelay_domain *md);
++int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md);
++
++/* kernel/sched/memdelay.c */
++void memdelay_enter(unsigned long *flags);
++void memdelay_leave(unsigned long *flags);
++
++/**
++ * memdelay_schedule - note a context switch
++ * @prev: task scheduling out
++ * @next: task scheduling in
++ *
++ * A task switch doesn't affect the balance between delayed and
++ * productive tasks, but we have to update whether the delay is
++ * actively using the CPU or not.
++ */
++static inline void memdelay_schedule(struct task_struct *prev,
++				     struct task_struct *next)
++{
++	if (prev->flags & PF_MEMDELAY)
++		memdelay_task_change(prev, MTS_DELAYED_ACTIVE, MTS_DELAYED);
++
++	if (next->flags & PF_MEMDELAY)
++		memdelay_task_change(next, MTS_DELAYED, MTS_DELAYED_ACTIVE);
++}
++
++/**
++ * memdelay_wakeup - note a task waking up
++ * @task: the task
++ *
++ * Notes an idle task becoming productive. Delayed tasks remain
++ * delayed even when they become runnable.
++ */
++static inline void memdelay_wakeup(struct task_struct *task)
++{
++	if (task->flags & PF_MEMDELAY)
++		return;
++
++	if (task->in_iowait)
++		memdelay_task_change(task, MTS_IOWAIT, MTS_RUNNABLE);
++	else
++		memdelay_task_change(task, MTS_NONE, MTS_RUNNABLE);
++}
++
++/**
++ * memdelay_sleep - note a task going to sleep
++ * @task: the task
++ *
++ * Notes a working task becoming unproductive. Delayed tasks remain
++ * delayed.
++ */
++static inline void memdelay_sleep(struct task_struct *task)
++{
++	if (task->flags & PF_MEMDELAY)
++		return;
++
++	if (task->in_iowait)
++		memdelay_task_change(task, MTS_RUNNABLE, MTS_IOWAIT);
++	else
++		memdelay_task_change(task, MTS_RUNNABLE, MTS_NONE);
++}
++
++/**
++ * memdelay_del_add - track task movement between runqueues
++ * @task: the task
++ * @runnable: a runnable task is moved if %true, unqueued otherwise
++ * @add: task is being added if %true, removed otherwise
++ *
++ * Update the memdelay domain per-cpu states as tasks are being moved
++ * around the runqueues.
++ */
++static inline void memdelay_del_add(struct task_struct *task,
++				    bool runnable, bool add)
++{
++	int state;
++
++	if (task->flags & PF_MEMDELAY)
++		state = MTS_DELAYED;
++	else if (runnable)
++		state = MTS_RUNNABLE;
++	else if (task->in_iowait)
++		state = MTS_IOWAIT;
++	else
++		return; /* already MTS_NONE */
++
++	if (add)
++		memdelay_task_change(task, MTS_NONE, state);
++	else
++		memdelay_task_change(task, state, MTS_NONE);
++}
++
++static inline void memdelay_del_runnable(struct task_struct *task)
++{
++	memdelay_del_add(task, true, false);
++}
++
++static inline void memdelay_add_runnable(struct task_struct *task)
++{
++	memdelay_del_add(task, true, true);
++}
++
++static inline void memdelay_del_sleeping(struct task_struct *task)
++{
++	memdelay_del_add(task, false, false);
++}
++
++static inline void memdelay_add_sleeping(struct task_struct *task)
++{
++	memdelay_del_add(task, false, true);
++}
++
++#ifdef CONFIG_CGROUPS
++void cgroup_move_task(struct task_struct *task, struct css_set *to);
++#endif
++
++#endif /* _LINUX_MEMDELAY_H */
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index c05ac5f5aa03..de15e3c8c43a 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -651,6 +651,7 @@ struct task_struct {
+ 	/* disallow userland-initiated cgroup migration */
+ 	unsigned			no_cgroup_migration:1;
+ #endif
++	unsigned			memdelay_migrate_enqueue:1;
+ 
+ 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
+ 
+@@ -871,6 +872,12 @@ struct task_struct {
+ 
+ 	struct io_context		*io_context;
+ 
++	u64				memdelay_start;
++	unsigned long			memdelay_total;
++#ifdef CONFIG_DEBUG_VM
++	int				memdelay_state;
++#endif
++
+ 	/* Ptrace state: */
+ 	unsigned long			ptrace_message;
+ 	siginfo_t			*last_siginfo;
+@@ -1274,6 +1281,7 @@ extern struct pid *cad_pid;
+ #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
+ #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
+ #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
++#define PF_MEMDELAY		0x01000000	/* Delayed due to lack of memory */
+ #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
+ #define PF_MCE_EARLY		0x08000000      /* Early kill for mce process policy */
+ #define PF_MUTEX_TESTER		0x20000000	/* Thread belongs to the rt mutex tester */
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index df2e0f14a95d..930aaef50396 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -699,7 +699,8 @@ static void css_set_move_task(struct task_struct *task,
+ 		 */
+ 		WARN_ON_ONCE(task->flags & PF_EXITING);
+ 
+-		rcu_assign_pointer(task->cgroups, to_cset);
++		cgroup_move_task(task, to_cset);
++
+ 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+ 							     &to_cset->tasks);
+ 	}
+diff --git a/kernel/fork.c b/kernel/fork.c
+index b7e9e57b71ea..96dd35393be9 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1208,6 +1208,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+ 	int retval;
+ 
+ 	tsk->min_flt = tsk->maj_flt = 0;
++	tsk->memdelay_total = 0;
++#ifdef CONFIG_DEBUG_VM
++	tsk->memdelay_state = 0;
++#endif
+ 	tsk->nvcsw = tsk->nivcsw = 0;
+ #ifdef CONFIG_DETECT_HUNG_TASK
+ 	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
+index 53f0164ed362..84390fc42f60 100644
+--- a/kernel/sched/Makefile
++++ b/kernel/sched/Makefile
+@@ -17,7 +17,7 @@ endif
+ 
+ obj-y += core.o loadavg.o clock.o cputime.o
+ obj-y += idle_task.o fair.o rt.o deadline.o
+-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
++obj-y += wait.o wait_bit.o swait.o completion.o idle.o memdelay.o
+ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
+ obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
+ obj-$(CONFIG_SCHEDSTATS) += stats.o
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 0869b20fba81..bf105c870da6 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -26,6 +26,7 @@
+ #include <linux/profile.h>
+ #include <linux/security.h>
+ #include <linux/syscalls.h>
++#include <linux/memdelay.h>
+ 
+ #include <asm/switch_to.h>
+ #include <asm/tlb.h>
+@@ -759,6 +760,14 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+ 	if (!(flags & ENQUEUE_RESTORE))
+ 		sched_info_queued(rq, p);
+ 
++	WARN_ON_ONCE(!(flags & ENQUEUE_WAKEUP) && p->memdelay_migrate_enqueue);
++	if (!(flags & ENQUEUE_WAKEUP) || p->memdelay_migrate_enqueue) {
++		memdelay_add_runnable(p);
++		p->memdelay_migrate_enqueue = 0;
++	} else {
++		memdelay_wakeup(p);
++	}
++
+ 	p->sched_class->enqueue_task(rq, p, flags);
+ }
+ 
+@@ -770,6 +779,11 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+ 	if (!(flags & DEQUEUE_SAVE))
+ 		sched_info_dequeued(rq, p);
+ 
++	if (!(flags & DEQUEUE_SLEEP))
++		memdelay_del_runnable(p);
++	else
++		memdelay_sleep(p);
++
+ 	p->sched_class->dequeue_task(rq, p, flags);
+ }
+ 
+@@ -2044,7 +2058,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+ 
+ 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ 	if (task_cpu(p) != cpu) {
++		struct rq_flags rf;
++		struct rq *rq;
++
+ 		wake_flags |= WF_MIGRATED;
++
++		rq = __task_rq_lock(p, &rf);
++		memdelay_del_sleeping(p);
++		__task_rq_unlock(rq, &rf);
++		p->memdelay_migrate_enqueue = 1;
++
+ 		set_task_cpu(p, cpu);
+ 	}
+ 
+@@ -3326,6 +3349,8 @@ static void __sched notrace __schedule(bool preempt)
+ 		rq->curr = next;
+ 		++*switch_count;
+ 
++		memdelay_schedule(prev, next);
++
+ 		trace_sched_switch(preempt, prev, next);
+ 
+ 		/* Also unlocks the rq: */
+@@ -5919,6 +5944,8 @@ void __init sched_init(void)
+ 
+ 	init_schedstats();
+ 
++	memdelay_init();
++
+ 	scheduler_running = 1;
+ }
+ 
+diff --git a/kernel/sched/memdelay.c b/kernel/sched/memdelay.c
+new file mode 100644
+index 000000000000..1d4813cd018a
+--- /dev/null
++++ b/kernel/sched/memdelay.c
+@@ -0,0 +1,118 @@
++/*
++ * Memory delay metric
++ *
++ * Copyright (c) 2017 Facebook, Johannes Weiner
++ *
++ * This code quantifies and reports to userspace the wall-time impact
++ * of memory pressure on the system and memory-controlled cgroups.
++ */
++
++#include <linux/memdelay.h>
++#include <linux/cgroup.h>
++#include <linux/sched.h>
++
++#include "sched.h"
++
++/**
++ * memdelay_enter - mark the beginning of a memory delay section
++ * @flags: flags to handle nested memdelay sections
++ *
++ * Marks the calling task as being delayed due to a lack of memory,
++ * such as waiting for a workingset refault or performing reclaim.
++ */
++void memdelay_enter(unsigned long *flags)
++{
++	struct rq_flags rf;
++	struct rq *rq;
++
++	*flags = current->flags & PF_MEMDELAY;
++	if (*flags)
++		return;
++	/*
++	 * PF_MEMDELAY & accounting need to be atomic wrt changes to
++	 * the task's scheduling state and its domain association.
++	 * Otherwise we could race with CPU or cgroup migration and
++	 * misaccount.
++	 */
++	local_irq_disable();
++	rq = this_rq();
++	rq_lock(rq, &rf);
++
++	current->flags |= PF_MEMDELAY;
++	memdelay_task_change(current, MTS_RUNNABLE, MTS_DELAYED_ACTIVE);
++
++	rq_unlock(rq, &rf);
++	local_irq_enable();
++}
++
++/**
++ * memdelay_leave - mark the end of a memory delay section
++ * @flags: flags to handle nested memdelay sections
++ *
++ * Marks the calling task as no longer delayed due to memory.
++ */
++void memdelay_leave(unsigned long *flags)
++{
++	struct rq_flags rf;
++	struct rq *rq;
++
++	if (*flags)
++		return;
++	/*
++	 * PF_MEMDELAY & accounting need to be atomic wrt changes to
++	 * the task's scheduling state and its domain association.
++	 * Otherwise we could race with CPU or cgroup migration and
++	 * misaccount.
++	 */
++	local_irq_disable();
++	rq = this_rq();
++	rq_lock(rq, &rf);
++
++	current->flags &= ~PF_MEMDELAY;
++	memdelay_task_change(current, MTS_DELAYED_ACTIVE, MTS_RUNNABLE);
++
++	rq_unlock(rq, &rf);
++	local_irq_enable();
++}
++
++#ifdef CONFIG_CGROUPS
++/**
++ * cgroup_move_task - move task to a different cgroup
++ * @task: the task
++ * @to: the target css_set
++ *
++ * Move task to a new cgroup and safely migrate its associated
++ * delayed/working state between the different domains.
++ *
++ * This function acquires the task's rq lock to lock out concurrent
++ * changes to the task's scheduling state and - in case the task is
++ * running - concurrent changes to its delay state.
++ */
++void cgroup_move_task(struct task_struct *task, struct css_set *to)
++{
++	struct rq_flags rf;
++	struct rq *rq;
++	int state;
++
++	rq = task_rq_lock(task, &rf);
++
++	if (task->flags & PF_MEMDELAY)
++		state = MTS_DELAYED + task_current(rq, task);
++	else if (task_on_rq_queued(task))
++		state = MTS_RUNNABLE;
++	else if (task->in_iowait)
++		state = MTS_IOWAIT;
++	else
++		state = MTS_NONE;
++
++	/*
++	 * Lame to do this here, but the scheduler cannot be locked
++	 * from the outside, so we move cgroups from inside sched/.
++	 */
++	memdelay_task_change(task, state, MTS_NONE);
++	rcu_assign_pointer(task->cgroups, to);
++	memdelay_task_change(task, MTS_NONE, state);
++
++	task_rq_unlock(rq, task, &rf);
++}
++#endif /* CONFIG_CGROUPS */
+diff --git a/mm/Makefile b/mm/Makefile
+index 411bd24d4a7c..c9bdbc5627e5 100644
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -39,7 +39,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
+ 			   mm_init.o mmu_context.o percpu.o slab_common.o \
+ 			   compaction.o vmacache.o swap_slots.o \
+ 			   interval_tree.o list_lru.o workingset.o \
+-			   debug.o $(mmu-y)
++			   memdelay.o debug.o $(mmu-y)
+ 
+ obj-y += init-mm.o
+ 
+diff --git a/mm/compaction.c b/mm/compaction.c
+index fb548e4c7bd4..adf67de23fee 100644
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -2040,11 +2040,15 @@ static int kcompactd(void *p)
+ 	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ 
+ 	while (!kthread_should_stop()) {
++		unsigned long mdflags;
++
+ 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ 		wait_event_freezable(pgdat->kcompactd_wait,
+ 				kcompactd_work_requested(pgdat));
+ 
++		memdelay_enter(&mdflags);
+ 		kcompactd_do_work(pgdat);
++		memdelay_leave(&mdflags);
+ 	}
+ 
+ 	return 0;
+diff --git a/mm/filemap.c b/mm/filemap.c
+index da55a5693da9..648418694405 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -36,6 +36,7 @@
+ #include <linux/memcontrol.h>
+ #include <linux/cleancache.h>
+ #include <linux/rmap.h>
++#include <linux/memdelay.h>
+ #include "internal.h"
+ 
+ #define CREATE_TRACE_POINTS
+@@ -961,8 +962,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ {
+ 	struct wait_page_queue wait_page;
+ 	wait_queue_entry_t *wait = &wait_page.wait;
++	unsigned long mdflags;
++	bool refault = false;
+ 	int ret = 0;
+ 
++	if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) {
++		memdelay_enter(&mdflags);
++		refault = true;
++	}
++
+ 	init_wait(wait);
+ 	wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
+ 	wait->func = wake_page_function;
+@@ -1001,6 +1009,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+ 
+ 	finish_wait(q, wait);
+ 
++	if (refault)
++		memdelay_leave(&mdflags);
++
+ 	/*
+ 	 * A signal could leave PageWaiters set. Clearing it here if
+ 	 * !waitqueue_active would be possible (by open-coding finish_wait),
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c
+index 93b2eb063afd..102f0f4d3f5c 100644
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -65,6 +65,7 @@
+ #include <linux/lockdep.h>
+ #include <linux/file.h>
+ #include <linux/tracehook.h>
++#include <linux/memdelay.h>
+ #include "internal.h"
+ #include <net/sock.h>
+ #include <net/ip.h>
+@@ -3926,6 +3927,8 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+ 	return ret;
+ }
+ 
++static int memory_memdelay_show(struct seq_file *m, void *v);
++
+ static struct cftype mem_cgroup_legacy_files[] = {
+ 	{
+ 		.name = "usage_in_bytes",
+@@ -3993,6 +3996,10 @@ static struct cftype mem_cgroup_legacy_files[] = {
+ 	{
+ 		.name = "pressure_level",
+ 	},
++	{
++		.name = "memdelay",
++		.seq_show = memory_memdelay_show,
++	},
+ #ifdef CONFIG_NUMA
+ 	{
+ 		.name = "numa_stat",
+@@ -4170,6 +4177,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
+ 
+ 	for_each_node(node)
+ 		free_mem_cgroup_per_node_info(memcg, node);
++	memdelay_domain_free(memcg->memdelay_domain);
+ 	free_percpu(memcg->stat);
+ 	kfree(memcg);
+ }
+@@ -4275,10 +4283,15 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+ 
+ 	/* The following stuff does not apply to the root */
+ 	if (!parent) {
++		memcg->memdelay_domain = &memdelay_global_domain;
+ 		root_mem_cgroup = memcg;
+ 		return &memcg->css;
+ 	}
+ 
++	memcg->memdelay_domain = memdelay_domain_alloc();
++	if (!memcg->memdelay_domain)
++		goto fail;
++
+ 	error = memcg_online_kmem(memcg);
+ 	if (error)
+ 		goto fail;
+@@ -5282,6 +5295,13 @@ static int memory_stat_show(struct seq_file *m, void *v)
+ 	return 0;
+ }
+ 
++static int memory_memdelay_show(struct seq_file *m, void *v)
++{
++	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
++
++	return memdelay_domain_show(m, memcg->memdelay_domain);
++}
++
+ static struct cftype memory_files[] = {
+ 	{
+ 		.name = "current",
+@@ -5317,6 +5337,11 @@ static struct cftype memory_files[] = {
+ 		.flags = CFTYPE_NOT_ON_ROOT,
+ 		.seq_show = memory_stat_show,
+ 	},
++	{
++		.name = "memdelay",
++		.flags = CFTYPE_NOT_ON_ROOT,
++		.seq_show = memory_memdelay_show,
++	},
+ 	{ }	/* terminate */
+ };
+ 
+diff --git a/mm/memdelay.c b/mm/memdelay.c
+new file mode 100644
+index 000000000000..c43d6f7ba22a
+--- /dev/null
++++ b/mm/memdelay.c
+@@ -0,0 +1,285 @@
++/*
++ * Memory delay metric
++ *
++ * Copyright (c) 2017 Facebook, Johannes Weiner
++ *
++ * This code quantifies and reports to userspace the wall-time impact
++ * of memory pressure on the system and memory-controlled cgroups.
++ */
++
++#include <linux/sched/loadavg.h>
++#include <linux/sched/clock.h>
++#include <linux/memcontrol.h>
++#include <linux/memdelay.h>
++#include <linux/seq_file.h>
++#include <linux/proc_fs.h>
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++
++static DEFINE_PER_CPU(struct memdelay_domain_cpu, global_domain_cpus);
++
++/* System-level keeping of memory delay statistics */
++struct memdelay_domain memdelay_global_domain = {
++	.mdcs = &global_domain_cpus,
++};
++
++static void domain_init(struct memdelay_domain *md)
++{
++	md->period_expires = jiffies + LOAD_FREQ;
++}
++
++/**
++ * memdelay_init - initialize the memdelay subsystem
++ *
++ * This needs to run before the scheduler starts queuing and
++ * scheduling tasks.
++ */
++void __init memdelay_init(void)
++{
++	domain_init(&memdelay_global_domain);
++}
++
++static void domain_move_clock(struct memdelay_domain *md)
++{
++	unsigned long expires = READ_ONCE(md->period_expires);
++	unsigned long none, some, full;
++	int missed_periods;
++	unsigned long next;
++	int i;
++
++	if (time_before(jiffies, expires))
++		return;
++
++	missed_periods = 1 + (jiffies - expires) / LOAD_FREQ;
++	next = expires + (missed_periods * LOAD_FREQ);
++
++	if (cmpxchg(&md->period_expires, expires, next) != expires)
++		return;
++
++	none = xchg(&md->times[MDS_NONE], 0);
++	some = xchg(&md->times[MDS_SOME], 0);
++	full = xchg(&md->times[MDS_FULL], 0);
++
++	for (i = 0; i < missed_periods; i++) {
++		unsigned long pct;
++
++		pct = some * 100 / max(none + some + full, 1UL);
++		pct *= FIXED_1;
++		CALC_LOAD(md->avg_some[0], EXP_1, pct);
++		CALC_LOAD(md->avg_some[1], EXP_5, pct);
++		CALC_LOAD(md->avg_some[2], EXP_15, pct);
++
++		pct = full * 100 / max(none + some + full, 1UL);
++		pct *= FIXED_1;
++		CALC_LOAD(md->avg_full[0], EXP_1, pct);
++		CALC_LOAD(md->avg_full[1], EXP_5, pct);
++		CALC_LOAD(md->avg_full[2], EXP_15, pct);
++
++		none = some = full = 0;
++	}
++}
++
++static void domain_cpu_update(struct memdelay_domain *md, int cpu,
++			      enum memdelay_task_state old,
++			      enum memdelay_task_state new)
++{
++	enum memdelay_domain_state state;
++	struct memdelay_domain_cpu *mdc;
++	unsigned long delta;
++	u64 now;
++
++	mdc = per_cpu_ptr(md->mdcs, cpu);
++
++	if (old) {
++		WARN_ONCE(!mdc->tasks[old], "cpu=%d old=%d new=%d counter=%d\n",
++			  cpu, old, new, mdc->tasks[old]);
++		mdc->tasks[old] -= 1;
++	}
++	if (new)
++		mdc->tasks[new] += 1;
++
++	/*
++	 * The domain is somewhat delayed when a number of tasks are
++	 * delayed but there are still others running the workload.
++	 *
++	 * The domain is fully delayed when all non-idle tasks on the
++	 * CPU are delayed, or when a delayed task is actively running
++	 * and preventing productive tasks from making headway.
++	 *
++	 * The state times then add up over all CPUs in the domain: if
++	 * the domain is fully blocked on one CPU and there is another
++	 * one running the workload, the domain is considered fully
++	 * blocked 50% of the time.
++	 */
++	if (mdc->tasks[MTS_DELAYED_ACTIVE] && !mdc->tasks[MTS_IOWAIT])
++		state = MDS_FULL;
++	else if (mdc->tasks[MTS_DELAYED])
++		state = (mdc->tasks[MTS_RUNNABLE] || mdc->tasks[MTS_IOWAIT]) ?
++			MDS_SOME : MDS_FULL;
++	else
++		state = MDS_NONE;
++
++	if (mdc->state == state)
++		return;
++
++	now = cpu_clock(cpu);
++	delta = (now - mdc->state_start) / NSEC_PER_USEC;
++
++	domain_move_clock(md);
++	md->times[mdc->state] += delta;
++
++	mdc->state = state;
++	mdc->state_start = now;
++}
++
++static struct memdelay_domain *memcg_domain(struct mem_cgroup *memcg)
++{
++#ifdef CONFIG_MEMCG
++	if (!mem_cgroup_disabled())
++		return memcg->memdelay_domain;
++#endif
++	return &memdelay_global_domain;
++}
++
++/**
++ * memdelay_task_change - note a task changing its delay/work state
++ * @task: the task changing state
++ * @old: old task state
++ * @new: new task state
++ *
++ * Updates the task's domain counters to reflect a change in the
++ * task's delayed/working state.
++ */
++void memdelay_task_change(struct task_struct *task,
++			  enum memdelay_task_state old,
++			  enum memdelay_task_state new)
++{
++	int cpu = task_cpu(task);
++	struct mem_cgroup *memcg;
++	unsigned long delay = 0;
++
++#ifdef CONFIG_DEBUG_VM
++	WARN_ONCE(task->memdelay_state != old,
++		  "cpu=%d task=%p state=%d (in_iowait=%d PF_MEMDELAYED=%d) old=%d new=%d\n",
++		  cpu, task, task->memdelay_state, task->in_iowait,
++		  !!(task->flags & PF_MEMDELAY), old, new);
++	task->memdelay_state = new;
++#endif
++
++	/* Account when tasks are entering and leaving delays */
++	if (old < MTS_DELAYED && new >= MTS_DELAYED) {
++		task->memdelay_start = cpu_clock(cpu);
++	} else if (old >= MTS_DELAYED && new < MTS_DELAYED) {
++		delay = (cpu_clock(cpu) - task->memdelay_start) / NSEC_PER_USEC;
++		task->memdelay_total += delay;
++	}
++
++	/* Account domain state changes */
++	rcu_read_lock();
++	memcg = mem_cgroup_from_task(task);
++	do {
++		struct memdelay_domain *md;
++
++		md = memcg_domain(memcg);
++		md->aggregate += delay;
++		domain_cpu_update(md, cpu, old, new);
++	} while (memcg && (memcg = parent_mem_cgroup(memcg)));
++	rcu_read_unlock();
++};
++
++/**
++ * memdelay_domain_alloc - allocate a cgroup memory delay domain
++ */
++struct memdelay_domain *memdelay_domain_alloc(void)
++{
++	struct memdelay_domain *md;
++
++	md = kzalloc(sizeof(*md), GFP_KERNEL);
++	if (!md)
++		return NULL;
++	md->mdcs = alloc_percpu(struct memdelay_domain_cpu);
++	if (!md->mdcs) {
++		kfree(md);
++		return NULL;
++	}
++	domain_init(md);
++	return md;
++}
++
++/**
++ * memdelay_domain_free - free a cgroup memory delay domain
++ */
++void memdelay_domain_free(struct memdelay_domain *md)
++{
++	if (md) {
++		free_percpu(md->mdcs);
++		kfree(md);
++	}
++}
++
++/**
++ * memdelay_domain_show - format memory delay domain stats to a seq_file
++ * @s: the seq_file
++ * @md: the memory domain
++ */
++int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md)
++{
++	domain_move_clock(md);
++
++	seq_printf(s, "%lu\n", md->aggregate);
++
++	seq_printf(s, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
++		   LOAD_INT(md->avg_some[0]), LOAD_FRAC(md->avg_some[0]),
++		   LOAD_INT(md->avg_some[1]), LOAD_FRAC(md->avg_some[1]),
++		   LOAD_INT(md->avg_some[2]), LOAD_FRAC(md->avg_some[2]));
++
++	seq_printf(s, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
++		   LOAD_INT(md->avg_full[0]), LOAD_FRAC(md->avg_full[0]),
++		   LOAD_INT(md->avg_full[1]), LOAD_FRAC(md->avg_full[1]),
++		   LOAD_INT(md->avg_full[2]), LOAD_FRAC(md->avg_full[2]));
++
++#ifdef CONFIG_DEBUG_VM
++	{
++		int cpu;
++
++		for_each_online_cpu(cpu) {
++			struct memdelay_domain_cpu *mdc;
++
++			mdc = per_cpu_ptr(md->mdcs, cpu);
++			seq_printf(s, "%d %d %d %d\n",
++				   mdc->tasks[MTS_IOWAIT],
++				   mdc->tasks[MTS_RUNNABLE],
++				   mdc->tasks[MTS_DELAYED],
++				   mdc->tasks[MTS_DELAYED_ACTIVE]);
++		}
++	}
++#endif
++
++	return 0;
++}
++
++static int memdelay_show(struct seq_file *m, void *v)
++{
++	return memdelay_domain_show(m, &memdelay_global_domain);
++}
++
++static int memdelay_open(struct inode *inode, struct file *file)
++{
++	return single_open(file, memdelay_show, NULL);
++}
++
++static const struct file_operations memdelay_fops = {
++	.open           = memdelay_open,
++	.read           = seq_read,
++	.llseek         = seq_lseek,
++	.release        = single_release,
++};
++
++static int __init memdelay_proc_init(void)
++{
++	proc_create("memdelay", 0, NULL, &memdelay_fops);
++	return 0;
++}
++module_init(memdelay_proc_init);
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 1423da8dd16f..d8d01e9df982 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -67,6 +67,7 @@
+ #include <linux/memcontrol.h>
+ #include <linux/ftrace.h>
+ #include <linux/nmi.h>
++#include <linux/memdelay.h>
+ 
+ #include <asm/sections.h>
+ #include <asm/tlbflush.h>
+@@ -3364,16 +3365,19 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+ 		unsigned int alloc_flags, const struct alloc_context *ac,
+ 		enum compact_priority prio, enum compact_result *compact_result)
+ {
+-	struct page *page;
+ 	unsigned int noreclaim_flag;
++	unsigned long mdflags;
++	struct page *page;
+ 
+ 	if (!order)
+ 		return NULL;
+ 
++	memdelay_enter(&mdflags);
+ 	noreclaim_flag = memalloc_noreclaim_save();
+ 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ 									prio);
+ 	memalloc_noreclaim_restore(noreclaim_flag);
++	memdelay_leave(&mdflags);
+ 
+ 	if (*compact_result <= COMPACT_INACTIVE)
+ 		return NULL;
+@@ -3519,13 +3523,15 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
+ 					const struct alloc_context *ac)
+ {
+ 	struct reclaim_state reclaim_state;
+-	int progress;
+ 	unsigned int noreclaim_flag;
++	unsigned long mdflags;
++	int progress;
+ 
+ 	cond_resched();
+ 
+ 	/* We now go into synchronous reclaim */
+ 	cpuset_memory_pressure_bump();
++	memdelay_enter(&mdflags);
+ 	noreclaim_flag = memalloc_noreclaim_save();
+ 	lockdep_set_current_reclaim_state(gfp_mask);
+ 	reclaim_state.reclaimed_slab = 0;
+@@ -3537,6 +3543,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
+ 	current->reclaim_state = NULL;
+ 	lockdep_clear_current_reclaim_state();
+ 	memalloc_noreclaim_restore(noreclaim_flag);
++	memdelay_leave(&mdflags);
+ 
+ 	cond_resched();
+ 
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 60357cd84c67..1029305b9b3a 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -48,6 +48,7 @@
+ #include <linux/prefetch.h>
+ #include <linux/printk.h>
+ #include <linux/dax.h>
++#include <linux/memdelay.h>
+ 
+ #include <asm/tlbflush.h>
+ #include <asm/div64.h>
+@@ -3098,6 +3099,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ {
+ 	struct zonelist *zonelist;
+ 	unsigned long nr_reclaimed;
++	unsigned long mdflags;
+ 	int nid;
+ 	unsigned int noreclaim_flag;
+ 	struct scan_control sc = {
+@@ -3126,9 +3128,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ 					    sc.gfp_mask,
+ 					    sc.reclaim_idx);
+ 
++	memdelay_enter(&mdflags);
+ 	noreclaim_flag = memalloc_noreclaim_save();
+ 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ 	memalloc_noreclaim_restore(noreclaim_flag);
++	memdelay_leave(&mdflags);
+ 
+ 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ 
+@@ -3550,6 +3554,7 @@ static int kswapd(void *p)
+ 	pgdat->kswapd_order = 0;
+ 	pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
+ 	for ( ; ; ) {
++		unsigned long mdflags;
+ 		bool ret;
+ 
+ 		alloc_order = reclaim_order = pgdat->kswapd_order;
+@@ -3586,7 +3591,11 @@ static int kswapd(void *p)
+ 		 */
+ 		trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+ 						alloc_order);
++
++		memdelay_enter(&mdflags);
+ 		reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
++		memdelay_leave(&mdflags);
++
+ 		if (reclaim_order < alloc_order)
+ 			goto kswapd_try_sleep;
+ 	}
+-- 
+2.14.1
diff --git a/a/content_digest b/N1/content_digest
index da1f80c..3e678d4 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -45,5 +45,1741 @@
  "\01:2\0"
  "fn\00001-sched-loadavg-consolidate-LOAD_INT-LOAD_FRAC-macros.patch\0"
  "b\0"
+ ">From d5ffeb4d9d65fcff1b7e50dbde8264b4c32824a5 Mon Sep 17 00:00:00 2001\n"
+ "From: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "Date: Wed, 14 Jun 2017 11:12:05 -0400\n"
+ "Subject: [PATCH 1/3] sched/loadavg: consolidate LOAD_INT, LOAD_FRAC macros\n"
+ "\n"
+ "There are several identical definitions of those macros in places that\n"
+ "mess with fixed-point load averages. Provide an official version.\n"
+ "\n"
+ "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "---\n"
+ " arch/powerpc/platforms/cell/spufs/sched.c | 3 ---\n"
+ " arch/s390/appldata/appldata_os.c          | 4 ----\n"
+ " drivers/cpuidle/governors/menu.c          | 4 ----\n"
+ " fs/proc/loadavg.c                         | 3 ---\n"
+ " include/linux/sched/loadavg.h             | 3 +++\n"
+ " kernel/debug/kdb/kdb_main.c               | 7 +------\n"
+ " 6 files changed, 4 insertions(+), 20 deletions(-)\n"
+ "\n"
+ "diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c\n"
+ "index 1fbb5da17dd2..de544070def3 100644\n"
+ "--- a/arch/powerpc/platforms/cell/spufs/sched.c\n"
+ "+++ b/arch/powerpc/platforms/cell/spufs/sched.c\n"
+ "@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx,\n"
+ " \t}\n"
+ " }\n"
+ " \n"
+ "-#define LOAD_INT(x) ((x) >> FSHIFT)\n"
+ "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n"
+ "-\n"
+ " static int show_spu_loadavg(struct seq_file *s, void *private)\n"
+ " {\n"
+ " \tint a, b, c;\n"
+ "diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c\n"
+ "index 45b3178200ab..a8aac17e1e82 100644\n"
+ "--- a/arch/s390/appldata/appldata_os.c\n"
+ "+++ b/arch/s390/appldata/appldata_os.c\n"
+ "@@ -24,10 +24,6 @@\n"
+ " \n"
+ " #include \"appldata.h\"\n"
+ " \n"
+ "-\n"
+ "-#define LOAD_INT(x) ((x) >> FSHIFT)\n"
+ "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n"
+ "-\n"
+ " /*\n"
+ "  * OS data\n"
+ "  *\n"
+ "diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c\n"
+ "index 61b64c2b2cb8..e215a2c10a61 100644\n"
+ "--- a/drivers/cpuidle/governors/menu.c\n"
+ "+++ b/drivers/cpuidle/governors/menu.c\n"
+ "@@ -132,10 +132,6 @@ struct menu_device {\n"
+ " \tint\t\tinterval_ptr;\n"
+ " };\n"
+ " \n"
+ "-\n"
+ "-#define LOAD_INT(x) ((x) >> FSHIFT)\n"
+ "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n"
+ "-\n"
+ " static inline int get_loadavg(unsigned long load)\n"
+ " {\n"
+ " \treturn LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;\n"
+ "diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c\n"
+ "index 983fce5c2418..111a25e4b088 100644\n"
+ "--- a/fs/proc/loadavg.c\n"
+ "+++ b/fs/proc/loadavg.c\n"
+ "@@ -9,9 +9,6 @@\n"
+ " #include <linux/seqlock.h>\n"
+ " #include <linux/time.h>\n"
+ " \n"
+ "-#define LOAD_INT(x) ((x) >> FSHIFT)\n"
+ "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n"
+ "-\n"
+ " static int loadavg_proc_show(struct seq_file *m, void *v)\n"
+ " {\n"
+ " \tunsigned long avnrun[3];\n"
+ "diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h\n"
+ "index 4264bc6b2c27..745483bb5cca 100644\n"
+ "--- a/include/linux/sched/loadavg.h\n"
+ "+++ b/include/linux/sched/loadavg.h\n"
+ "@@ -26,6 +26,9 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);\n"
+ " \tload += n*(FIXED_1-exp); \\\n"
+ " \tload >>= FSHIFT;\n"
+ " \n"
+ "+#define LOAD_INT(x) ((x) >> FSHIFT)\n"
+ "+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n"
+ "+\n"
+ " extern void calc_global_load(unsigned long ticks);\n"
+ " \n"
+ " #endif /* _LINUX_SCHED_LOADAVG_H */\n"
+ "diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c\n"
+ "index c8146d53ca67..2dddd25ccd7a 100644\n"
+ "--- a/kernel/debug/kdb/kdb_main.c\n"
+ "+++ b/kernel/debug/kdb/kdb_main.c\n"
+ "@@ -2571,16 +2571,11 @@ static int kdb_summary(int argc, const char **argv)\n"
+ " \t}\n"
+ " \tkdb_printf(\"%02ld:%02ld\\n\", val.uptime/(60*60), (val.uptime/60)%60);\n"
+ " \n"
+ "-\t/* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */\n"
+ "-\n"
+ "-#define LOAD_INT(x) ((x) >> FSHIFT)\n"
+ "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n"
+ " \tkdb_printf(\"load avg   %ld.%02ld %ld.%02ld %ld.%02ld\\n\",\n"
+ " \t\tLOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),\n"
+ " \t\tLOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),\n"
+ " \t\tLOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));\n"
+ "-#undef LOAD_INT\n"
+ "-#undef LOAD_FRAC\n"
+ "+\n"
+ " \t/* Display in kilobytes */\n"
+ " #define K(x) ((x) << (PAGE_SHIFT - 10))\n"
+ " \tkdb_printf(\"\\nMemTotal:       %8lu kB\\nMemFree:        %8lu kB\\n\"\n"
+ "-- \n"
+ 2.14.1
+ "\01:3\0"
+ "fn\00002-mm-workingset-tell-cache-transitions-from-workingset.patch\0"
+ "b\0"
+ ">From 4ccc6444efbdcc30680eff6b8f345511c306f3d7 Mon Sep 17 00:00:00 2001\n"
+ "From: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "Date: Thu, 2 Mar 2017 09:58:03 -0500\n"
+ "Subject: [PATCH 2/3] mm: workingset: tell cache transitions from workingset\n"
+ " thrashing\n"
+ "\n"
+ "Refaults happen during transitions between workingsets as well as\n"
+ "in-place thrashing. Knowing the difference between the two has a range\n"
+ "of applications, including measuring the impact of memory shortage on\n"
+ "the system performance, as well as the ability to more smartly balance\n"
+ "pressure between the filesystem cache and the swap-backed workingset.\n"
+ "\n"
+ "During workingset transitions, inactive cache refaults and pushes out\n"
+ "established active cache. When that active cache isn't stale, however,\n"
+ "and also ends up refaulting, that's bonafide thrashing.\n"
+ "\n"
+ "Introduce a new page flag that tells on eviction whether the page has\n"
+ "been active or not in its lifetime. This bit is then stored in the\n"
+ "shadow entry, to classify refaults as transitioning or thrashing.\n"
+ "\n"
+ "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "---\n"
+ " include/linux/mmzone.h         |  1 +\n"
+ " include/linux/page-flags.h     |  5 ++-\n"
+ " include/linux/swap.h           |  2 +-\n"
+ " include/trace/events/mmflags.h |  1 +\n"
+ " mm/filemap.c                   |  9 ++--\n"
+ " mm/huge_memory.c               |  1 +\n"
+ " mm/memcontrol.c                |  2 +\n"
+ " mm/migrate.c                   |  2 +\n"
+ " mm/swap_state.c                |  1 +\n"
+ " mm/vmscan.c                    |  1 +\n"
+ " mm/vmstat.c                    |  1 +\n"
+ " mm/workingset.c                | 96 +++++++++++++++++++++++++++---------------\n"
+ " 12 files changed, 79 insertions(+), 43 deletions(-)\n"
+ "\n"
+ "diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h\n"
+ "index fc14b8b3f6ce..b8726b501166 100644\n"
+ "--- a/include/linux/mmzone.h\n"
+ "+++ b/include/linux/mmzone.h\n"
+ "@@ -156,6 +156,7 @@ enum node_stat_item {\n"
+ " \tNR_ISOLATED_FILE,\t/* Temporary isolated pages from file lru */\n"
+ " \tWORKINGSET_REFAULT,\n"
+ " \tWORKINGSET_ACTIVATE,\n"
+ "+\tWORKINGSET_RESTORE,\n"
+ " \tWORKINGSET_NODERECLAIM,\n"
+ " \tNR_ANON_MAPPED,\t/* Mapped anonymous pages */\n"
+ " \tNR_FILE_MAPPED,\t/* pagecache pages mapped into pagetables.\n"
+ "diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h\n"
+ "index d33e3280c8ad..f889af1a6aed 100644\n"
+ "--- a/include/linux/page-flags.h\n"
+ "+++ b/include/linux/page-flags.h\n"
+ "@@ -73,13 +73,14 @@\n"
+ "  */\n"
+ " enum pageflags {\n"
+ " \tPG_locked,\t\t/* Page is locked. Don't touch. */\n"
+ "-\tPG_error,\n"
+ " \tPG_referenced,\n"
+ " \tPG_uptodate,\n"
+ " \tPG_dirty,\n"
+ " \tPG_lru,\n"
+ " \tPG_active,\n"
+ "+\tPG_workingset,\n"
+ " \tPG_waiters,\t\t/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as \"PG_locked\" */\n"
+ "+\tPG_error,\n"
+ " \tPG_slab,\n"
+ " \tPG_owner_priv_1,\t/* Owner use. If pagecache, fs may use*/\n"
+ " \tPG_arch_1,\n"
+ "@@ -272,6 +273,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)\n"
+ " PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)\n"
+ " PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)\n"
+ " \tTESTCLEARFLAG(Active, active, PF_HEAD)\n"
+ "+PAGEFLAG(Workingset, workingset, PF_HEAD)\n"
+ "+\tTESTCLEARFLAG(Workingset, workingset, PF_HEAD)\n"
+ " __PAGEFLAG(Slab, slab, PF_NO_TAIL)\n"
+ " __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)\n"
+ " PAGEFLAG(Checked, checked, PF_NO_COMPOUND)\t   /* Used by some filesystems */\n"
+ "diff --git a/include/linux/swap.h b/include/linux/swap.h\n"
+ "index d83d28e53e62..914a173beee1 100644\n"
+ "--- a/include/linux/swap.h\n"
+ "+++ b/include/linux/swap.h\n"
+ "@@ -252,7 +252,7 @@ struct swap_info_struct {\n"
+ " \n"
+ " /* linux/mm/workingset.c */\n"
+ " void *workingset_eviction(struct address_space *mapping, struct page *page);\n"
+ "-bool workingset_refault(void *shadow);\n"
+ "+void workingset_refault(struct page *page, void *shadow);\n"
+ " void workingset_activation(struct page *page);\n"
+ " void workingset_update_node(struct radix_tree_node *node, void *private);\n"
+ " \n"
+ "diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h\n"
+ "index 8e50d01c645f..aac9eb272754 100644\n"
+ "--- a/include/trace/events/mmflags.h\n"
+ "+++ b/include/trace/events/mmflags.h\n"
+ "@@ -90,6 +90,7 @@\n"
+ " \t{1UL << PG_dirty,\t\t\"dirty\"\t\t},\t\t\\\n"
+ " \t{1UL << PG_lru,\t\t\t\"lru\"\t\t},\t\t\\\n"
+ " \t{1UL << PG_active,\t\t\"active\"\t},\t\t\\\n"
+ "+\t{1UL << PG_workingset,\t\t\"workingset\"\t},\t\t\\\n"
+ " \t{1UL << PG_slab,\t\t\"slab\"\t\t},\t\t\\\n"
+ " \t{1UL << PG_owner_priv_1,\t\"owner_priv_1\"\t},\t\t\\\n"
+ " \t{1UL << PG_arch_1,\t\t\"arch_1\"\t},\t\t\\\n"
+ "diff --git a/mm/filemap.c b/mm/filemap.c\n"
+ "index 65b4b6e7f7bd..da55a5693da9 100644\n"
+ "--- a/mm/filemap.c\n"
+ "+++ b/mm/filemap.c\n"
+ "@@ -823,12 +823,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,\n"
+ " \t\t * data from the working set, only to cache data that will\n"
+ " \t\t * get overwritten with something else, is a waste of memory.\n"
+ " \t\t */\n"
+ "-\t\tif (!(gfp_mask & __GFP_WRITE) &&\n"
+ "-\t\t    shadow && workingset_refault(shadow)) {\n"
+ "-\t\t\tSetPageActive(page);\n"
+ "-\t\t\tworkingset_activation(page);\n"
+ "-\t\t} else\n"
+ "-\t\t\tClearPageActive(page);\n"
+ "+\t\tWARN_ON_ONCE(PageActive(page));\n"
+ "+\t\tif (!(gfp_mask & __GFP_WRITE) && shadow)\n"
+ "+\t\t\tworkingset_refault(page, shadow);\n"
+ " \t\tlru_cache_add(page);\n"
+ " \t}\n"
+ " \treturn ret;\n"
+ "diff --git a/mm/huge_memory.c b/mm/huge_memory.c\n"
+ "index 90731e3b7e58..b18ac8084c2a 100644\n"
+ "--- a/mm/huge_memory.c\n"
+ "+++ b/mm/huge_memory.c\n"
+ "@@ -2239,6 +2239,7 @@ static void __split_huge_page_tail(struct page *head, int tail,\n"
+ " \t\t\t (1L << PG_mlocked) |\n"
+ " \t\t\t (1L << PG_uptodate) |\n"
+ " \t\t\t (1L << PG_active) |\n"
+ "+\t\t\t (1L << PG_workingset) |\n"
+ " \t\t\t (1L << PG_locked) |\n"
+ " \t\t\t (1L << PG_unevictable) |\n"
+ " \t\t\t (1L << PG_dirty)));\n"
+ "diff --git a/mm/memcontrol.c b/mm/memcontrol.c\n"
+ "index e09741af816f..93b2eb063afd 100644\n"
+ "--- a/mm/memcontrol.c\n"
+ "+++ b/mm/memcontrol.c\n"
+ "@@ -5274,6 +5274,8 @@ static int memory_stat_show(struct seq_file *m, void *v)\n"
+ " \t\t   stat[WORKINGSET_REFAULT]);\n"
+ " \tseq_printf(m, \"workingset_activate %lu\\n\",\n"
+ " \t\t   stat[WORKINGSET_ACTIVATE]);\n"
+ "+\tseq_printf(m, \"workingset_restore %lu\\n\",\n"
+ "+\t\t   stat[WORKINGSET_RESTORE]);\n"
+ " \tseq_printf(m, \"workingset_nodereclaim %lu\\n\",\n"
+ " \t\t   stat[WORKINGSET_NODERECLAIM]);\n"
+ " \n"
+ "diff --git a/mm/migrate.c b/mm/migrate.c\n"
+ "index e84eeb4e4356..48f4a79869ce 100644\n"
+ "--- a/mm/migrate.c\n"
+ "+++ b/mm/migrate.c\n"
+ "@@ -624,6 +624,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)\n"
+ " \t\tSetPageActive(newpage);\n"
+ " \t} else if (TestClearPageUnevictable(page))\n"
+ " \t\tSetPageUnevictable(newpage);\n"
+ "+\tif (PageWorkingset(page))\n"
+ "+\t\tSetPageWorkingset(newpage);\n"
+ " \tif (PageChecked(page))\n"
+ " \t\tSetPageChecked(newpage);\n"
+ " \tif (PageMappedToDisk(page))\n"
+ "diff --git a/mm/swap_state.c b/mm/swap_state.c\n"
+ "index b68c93014f50..b39b3969be07 100644\n"
+ "--- a/mm/swap_state.c\n"
+ "+++ b/mm/swap_state.c\n"
+ "@@ -387,6 +387,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,\n"
+ " \t\t\t/*\n"
+ " \t\t\t * Initiate read into locked page and return.\n"
+ " \t\t\t */\n"
+ "+\t\t\tSetPageWorkingset(new_page);\n"
+ " \t\t\tlru_cache_add_anon(new_page);\n"
+ " \t\t\t*new_page_allocated = true;\n"
+ " \t\t\treturn new_page;\n"
+ "diff --git a/mm/vmscan.c b/mm/vmscan.c\n"
+ "index a1af041930a6..60357cd84c67 100644\n"
+ "--- a/mm/vmscan.c\n"
+ "+++ b/mm/vmscan.c\n"
+ "@@ -2022,6 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,\n"
+ " \t\t}\n"
+ " \n"
+ " \t\tClearPageActive(page);\t/* we are de-activating */\n"
+ "+\t\tSetPageWorkingset(page);\n"
+ " \t\tlist_add(&page->lru, &l_inactive);\n"
+ " \t}\n"
+ " \n"
+ "diff --git a/mm/vmstat.c b/mm/vmstat.c\n"
+ "index 9a4441bbeef2..87ce53498828 100644\n"
+ "--- a/mm/vmstat.c\n"
+ "+++ b/mm/vmstat.c\n"
+ "@@ -956,6 +956,7 @@ const char * const vmstat_text[] = {\n"
+ " \t\"nr_isolated_file\",\n"
+ " \t\"workingset_refault\",\n"
+ " \t\"workingset_activate\",\n"
+ "+\t\"workingset_restore\",\n"
+ " \t\"workingset_nodereclaim\",\n"
+ " \t\"nr_anon_pages\",\n"
+ " \t\"nr_mapped\",\n"
+ "diff --git a/mm/workingset.c b/mm/workingset.c\n"
+ "index 7119cd745ace..264f0498f2bc 100644\n"
+ "--- a/mm/workingset.c\n"
+ "+++ b/mm/workingset.c\n"
+ "@@ -120,7 +120,7 @@\n"
+ "  * the only thing eating into inactive list space is active pages.\n"
+ "  *\n"
+ "  *\n"
+ "- *\t\tActivating refaulting pages\n"
+ "+ *\t\tRefaulting inactive pages\n"
+ "  *\n"
+ "  * All that is known about the active list is that the pages have been\n"
+ "  * accessed more than once in the past.  This means that at any given\n"
+ "@@ -133,6 +133,10 @@\n"
+ "  * used less frequently than the refaulting page - or even not used at\n"
+ "  * all anymore.\n"
+ "  *\n"
+ "+ * That means if inactive cache is refaulting with a suitable refault\n"
+ "+ * distance, we assume the cache workingset is transitioning and put\n"
+ "+ * pressure on the current active list.\n"
+ "+ *\n"
+ "  * If this is wrong and demotion kicks in, the pages which are truly\n"
+ "  * used more frequently will be reactivated while the less frequently\n"
+ "  * used once will be evicted from memory.\n"
+ "@@ -140,6 +144,14 @@\n"
+ "  * But if this is right, the stale pages will be pushed out of memory\n"
+ "  * and the used pages get to stay in cache.\n"
+ "  *\n"
+ "+ *\t\tRefaulting active pages\n"
+ "+ *\n"
+ "+ * If on the other hand the refaulting pages have recently been\n"
+ "+ * deactivated, it means that the active list is no longer protecting\n"
+ "+ * actively used cache from reclaim. The cache is NOT transitioning to\n"
+ "+ * a different workingset; the existing workingset is thrashing in the\n"
+ "+ * space allocated to the page cache.\n"
+ "+ *\n"
+ "  *\n"
+ "  *\t\tImplementation\n"
+ "  *\n"
+ "@@ -155,8 +167,7 @@\n"
+ "  */\n"
+ " \n"
+ " #define EVICTION_SHIFT\t(RADIX_TREE_EXCEPTIONAL_ENTRY + \\\n"
+ "-\t\t\t NODES_SHIFT +\t\\\n"
+ "-\t\t\t MEM_CGROUP_ID_SHIFT)\n"
+ "+\t\t\t 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)\n"
+ " #define EVICTION_MASK\t(~0UL >> EVICTION_SHIFT)\n"
+ " \n"
+ " /*\n"
+ "@@ -169,23 +180,28 @@\n"
+ "  */\n"
+ " static unsigned int bucket_order __read_mostly;\n"
+ " \n"
+ "-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)\n"
+ "+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,\n"
+ "+\t\t\t bool workingset)\n"
+ " {\n"
+ " \teviction >>= bucket_order;\n"
+ " \teviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;\n"
+ " \teviction = (eviction << NODES_SHIFT) | pgdat->node_id;\n"
+ "+\teviction = (eviction << 1) | workingset;\n"
+ " \teviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);\n"
+ " \n"
+ " \treturn (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);\n"
+ " }\n"
+ " \n"
+ " static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,\n"
+ "-\t\t\t  unsigned long *evictionp)\n"
+ "+\t\t\t  unsigned long *evictionp, bool *workingsetp)\n"
+ " {\n"
+ " \tunsigned long entry = (unsigned long)shadow;\n"
+ " \tint memcgid, nid;\n"
+ "+\tbool workingset;\n"
+ " \n"
+ " \tentry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;\n"
+ "+\tworkingset = entry & 1;\n"
+ "+\tentry >>= 1;\n"
+ " \tnid = entry & ((1UL << NODES_SHIFT) - 1);\n"
+ " \tentry >>= NODES_SHIFT;\n"
+ " \tmemcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);\n"
+ "@@ -194,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,\n"
+ " \t*memcgidp = memcgid;\n"
+ " \t*pgdat = NODE_DATA(nid);\n"
+ " \t*evictionp = entry << bucket_order;\n"
+ "+\t*workingsetp = workingset;\n"
+ " }\n"
+ " \n"
+ " /**\n"
+ "@@ -206,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,\n"
+ "  */\n"
+ " void *workingset_eviction(struct address_space *mapping, struct page *page)\n"
+ " {\n"
+ "-\tstruct mem_cgroup *memcg = page_memcg(page);\n"
+ " \tstruct pglist_data *pgdat = page_pgdat(page);\n"
+ "+\tstruct mem_cgroup *memcg = page_memcg(page);\n"
+ " \tint memcgid = mem_cgroup_id(memcg);\n"
+ " \tunsigned long eviction;\n"
+ " \tstruct lruvec *lruvec;\n"
+ "@@ -219,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)\n"
+ " \n"
+ " \tlruvec = mem_cgroup_lruvec(pgdat, memcg);\n"
+ " \teviction = atomic_long_inc_return(&lruvec->inactive_age);\n"
+ "-\treturn pack_shadow(memcgid, pgdat, eviction);\n"
+ "+\treturn pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));\n"
+ " }\n"
+ " \n"
+ " /**\n"
+ "  * workingset_refault - evaluate the refault of a previously evicted page\n"
+ "+ * @page: the freshly allocated replacement page\n"
+ "  * @shadow: shadow entry of the evicted page\n"
+ "  *\n"
+ "  * Calculates and evaluates the refault distance of the previously\n"
+ "  * evicted page in the context of the node it was allocated in.\n"
+ "- *\n"
+ "- * Returns %true if the page should be activated, %false otherwise.\n"
+ "  */\n"
+ "-bool workingset_refault(void *shadow)\n"
+ "+void workingset_refault(struct page *page, void *shadow)\n"
+ " {\n"
+ " \tunsigned long refault_distance;\n"
+ "+\tstruct pglist_data *pgdat;\n"
+ " \tunsigned long active_file;\n"
+ " \tstruct mem_cgroup *memcg;\n"
+ " \tunsigned long eviction;\n"
+ " \tstruct lruvec *lruvec;\n"
+ " \tunsigned long refault;\n"
+ "-\tstruct pglist_data *pgdat;\n"
+ "+\tbool workingset;\n"
+ " \tint memcgid;\n"
+ " \n"
+ "-\tunpack_shadow(shadow, &memcgid, &pgdat, &eviction);\n"
+ "+\tunpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);\n"
+ " \n"
+ " \trcu_read_lock();\n"
+ " \t/*\n"
+ "@@ -262,41 +279,50 @@ bool workingset_refault(void *shadow)\n"
+ " \t * configurations instead.\n"
+ " \t */\n"
+ " \tmemcg = mem_cgroup_from_id(memcgid);\n"
+ "-\tif (!mem_cgroup_disabled() && !memcg) {\n"
+ "-\t\trcu_read_unlock();\n"
+ "-\t\treturn false;\n"
+ "-\t}\n"
+ "+\tif (!mem_cgroup_disabled() && !memcg)\n"
+ "+\t\tgoto out;\n"
+ " \tlruvec = mem_cgroup_lruvec(pgdat, memcg);\n"
+ " \trefault = atomic_long_read(&lruvec->inactive_age);\n"
+ " \tactive_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);\n"
+ " \n"
+ " \t/*\n"
+ "-\t * The unsigned subtraction here gives an accurate distance\n"
+ "-\t * across inactive_age overflows in most cases.\n"
+ "+\t * Calculate the refault distance\n"
+ " \t *\n"
+ "-\t * There is a special case: usually, shadow entries have a\n"
+ "-\t * short lifetime and are either refaulted or reclaimed along\n"
+ "-\t * with the inode before they get too old.  But it is not\n"
+ "-\t * impossible for the inactive_age to lap a shadow entry in\n"
+ "-\t * the field, which can then can result in a false small\n"
+ "-\t * refault distance, leading to a false activation should this\n"
+ "-\t * old entry actually refault again.  However, earlier kernels\n"
+ "-\t * used to deactivate unconditionally with *every* reclaim\n"
+ "-\t * invocation for the longest time, so the occasional\n"
+ "-\t * inappropriate activation leading to pressure on the active\n"
+ "-\t * list is not a problem.\n"
+ "+\t * The unsigned subtraction here gives an accurate distance\n"
+ "+\t * across inactive_age overflows in most cases. There is a\n"
+ "+\t * special case: usually, shadow entries have a short lifetime\n"
+ "+\t * and are either refaulted or reclaimed along with the inode\n"
+ "+\t * before they get too old.  But it is not impossible for the\n"
+ "+\t * inactive_age to lap a shadow entry in the field, which can\n"
+ "+\t * then result in a false small refault distance, leading\n"
+ "+\t * to a false activation should this old entry actually\n"
+ "+\t * refault again.  However, earlier kernels used to deactivate\n"
+ "+\t * unconditionally with *every* reclaim invocation for the\n"
+ "+\t * longest time, so the occasional inappropriate activation\n"
+ "+\t * leading to pressure on the active list is not a problem.\n"
+ " \t */\n"
+ " \trefault_distance = (refault - eviction) & EVICTION_MASK;\n"
+ " \n"
+ " \tinc_lruvec_state(lruvec, WORKINGSET_REFAULT);\n"
+ " \n"
+ "-\tif (refault_distance <= active_file) {\n"
+ "-\t\tinc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);\n"
+ "-\t\trcu_read_unlock();\n"
+ "-\t\treturn true;\n"
+ "-\t}\n"
+ "+\t/*\n"
+ "+\t * Compare the distance to the existing workingset size. We\n"
+ "+\t * don't act on pages that couldn't stay resident even if all\n"
+ "+\t * the memory was available to the page cache.\n"
+ "+\t */\n"
+ "+\tif (refault_distance > active_file)\n"
+ "+\t\tgoto out;\n"
+ "+\n"
+ "+\tSetPageActive(page);\n"
+ "+\tSetPageWorkingset(page);\n"
+ "+\tatomic_long_inc(&lruvec->inactive_age);\n"
+ "+\tinc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);\n"
+ "+\n"
+ "+\t/* Page was active prior to eviction */\n"
+ "+\tif (workingset)\n"
+ "+\t\tinc_lruvec_state(lruvec, WORKINGSET_RESTORE);\n"
+ "+out:\n"
+ " \trcu_read_unlock();\n"
+ "-\treturn false;\n"
+ " }\n"
+ " \n"
+ " /**\n"
+ "-- \n"
+ 2.14.1
+ "\01:4\0"
+ "fn\00003-mm-sched-memdelay-memory-health-interface-for-system.patch\0"
+ "b\0"
+ ">From c3e97f5daf99bcd54383eaab466c477dbb743dd9 Mon Sep 17 00:00:00 2001\n"
+ "From: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "Date: Mon, 5 Jun 2017 16:07:22 -0400\n"
+ "Subject: [PATCH 3/3] mm/sched: memdelay: memory health interface for systems\n"
+ " and workloads\n"
+ "\n"
+ "Linux doesn't have a useful metric to describe the memory health of a\n"
+ "system, a cgroup container, or individual tasks.\n"
+ "\n"
+ "When workloads are bigger than available memory, they spend a certain\n"
+ "amount of their time inside page reclaim, waiting on thrashing cache,\n"
+ "and swapping in. This has impact on latency, and depending on the CPU\n"
+ "capacity in the system can also translate to a decrease in throughput.\n"
+ "\n"
+ "While Linux exports some stats and counters for these events, it does\n"
+ "not quantify the true impact they have on throughput and latency. How\n"
+ "much of the execution time is spent unproductively? This is important\n"
+ "to know when sizing workloads to systems and containers. It also comes\n"
+ "in handy when evaluating the effectiveness and efficiency of the\n"
+ "kernel's memory management policies and heuristics.\n"
+ "\n"
+ "This patch implements a metric that quantifies memory pressure in a\n"
+ "unit that matters most to applications and does not rely on hardware\n"
+ "aspects to be meaningful: wallclock time lost while waiting on memory.\n"
+ "\n"
+ "Whenever a task is blocked on refaults, swapins, or direct reclaim,\n"
+ "the time it spends is accounted on the task level and aggregated into\n"
+ "a domain state along with other tasks on the system and cgroup level.\n"
+ "\n"
+ "Each task has a /proc/<pid>/memdelay file that lists the microseconds\n"
+ "the task has been delayed since it's been forked. That file can be\n"
+ "sampled periodically for recent delays, or before and after certain\n"
+ "operations to measure their memory-related latencies.\n"
+ "\n"
+ "On the system and cgroup-level, there are /proc/memdelay and\n"
+ "memory.memdelay, respectively, and their format is as such:\n"
+ "\n"
+ "$ cat /proc/memdelay\n"
+ "2489084\n"
+ "41.61 47.28 29.66\n"
+ "0.00 0.00 0.00\n"
+ "\n"
+ "The first line shows the cumulative delay times of all tasks in the\n"
+ "domain - in this case, all tasks in the system cumulatively lost 2.49\n"
+ "seconds due to memory delays.\n"
+ "\n"
+ "The second and third line show percentages spent in aggregate states\n"
+ "for the domain - system or cgroup - in a load average type format as\n"
+ "decaying averages over the last 1m, 5m, and 15m:\n"
+ "\n"
+ "The second line indicates the share of wall-time the domain spends in\n"
+ "a state where SOME tasks are delayed by memory while others are still\n"
+ "productive (runnable or iowait). This indicates a latency problem for\n"
+ "individual tasks, but since the CPU/IO capacity is still used, adding\n"
+ "more memory might not necessarily improve the domain's throughput.\n"
+ "\n"
+ "The third line indicates the share of wall-time the domain spends in a\n"
+ "state where ALL non-idle tasks are delayed by memory. In this state,\n"
+ "the domain is entirely unproductive due to a lack of memory.\n"
+ "\n"
+ "v2:\n"
+ "- fix active-delay condition when only other runnables, no iowait\n"
+ "- drop private lock from sched path, we can use the rq lock\n"
+ "- fix refault vs. simple lockwait detection\n"
+ "- drop ktime, we can use cpu_clock()\n"
+ "\n"
+ "XXX:\n"
+ "- eliminate redundant cgroup hierarchy walks in the scheduler\n"
+ "\n"
+ "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n"
+ "---\n"
+ " fs/proc/array.c            |   8 ++\n"
+ " fs/proc/base.c             |   2 +\n"
+ " fs/proc/internal.h         |   2 +\n"
+ " include/linux/memcontrol.h |  14 +++\n"
+ " include/linux/memdelay.h   | 182 +++++++++++++++++++++++++++++\n"
+ " include/linux/sched.h      |   8 ++\n"
+ " kernel/cgroup/cgroup.c     |   3 +-\n"
+ " kernel/fork.c              |   4 +\n"
+ " kernel/sched/Makefile      |   2 +-\n"
+ " kernel/sched/core.c        |  27 +++++\n"
+ " kernel/sched/memdelay.c    | 118 +++++++++++++++++++\n"
+ " mm/Makefile                |   2 +-\n"
+ " mm/compaction.c            |   4 +\n"
+ " mm/filemap.c               |  11 ++\n"
+ " mm/memcontrol.c            |  25 ++++\n"
+ " mm/memdelay.c              | 285 +++++++++++++++++++++++++++++++++++++++++++++\n"
+ " mm/page_alloc.c            |  11 +-\n"
+ " mm/vmscan.c                |   9 ++\n"
+ " 18 files changed, 712 insertions(+), 5 deletions(-)\n"
+ " create mode 100644 include/linux/memdelay.h\n"
+ " create mode 100644 kernel/sched/memdelay.c\n"
+ " create mode 100644 mm/memdelay.c\n"
+ "\n"
+ "diff --git a/fs/proc/array.c b/fs/proc/array.c\n"
+ "index 88c355574aa0..00e0e9aa3e70 100644\n"
+ "--- a/fs/proc/array.c\n"
+ "+++ b/fs/proc/array.c\n"
+ "@@ -611,6 +611,14 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,\n"
+ " \treturn 0;\n"
+ " }\n"
+ " \n"
+ "+int proc_pid_memdelay(struct seq_file *m, struct pid_namespace *ns,\n"
+ "+\t\t      struct pid *pid, struct task_struct *task)\n"
+ "+{\n"
+ "+\tseq_put_decimal_ull(m, \"\", task->memdelay_total);\n"
+ "+\tseq_putc(m, '\\n');\n"
+ "+\treturn 0;\n"
+ "+}\n"
+ "+\n"
+ " #ifdef CONFIG_PROC_CHILDREN\n"
+ " static struct pid *\n"
+ " get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)\n"
+ "diff --git a/fs/proc/base.c b/fs/proc/base.c\n"
+ "index 719c2e943ea1..19f194940c80 100644\n"
+ "--- a/fs/proc/base.c\n"
+ "+++ b/fs/proc/base.c\n"
+ "@@ -2916,6 +2916,7 @@ static const struct pid_entry tgid_base_stuff[] = {\n"
+ " \tREG(\"cmdline\",    S_IRUGO, proc_pid_cmdline_ops),\n"
+ " \tONE(\"stat\",       S_IRUGO, proc_tgid_stat),\n"
+ " \tONE(\"statm\",      S_IRUGO, proc_pid_statm),\n"
+ "+\tONE(\"memdelay\",   S_IRUGO, proc_pid_memdelay),\n"
+ " \tREG(\"maps\",       S_IRUGO, proc_pid_maps_operations),\n"
+ " #ifdef CONFIG_NUMA\n"
+ " \tREG(\"numa_maps\",  S_IRUGO, proc_pid_numa_maps_operations),\n"
+ "@@ -3307,6 +3308,7 @@ static const struct pid_entry tid_base_stuff[] = {\n"
+ " \tREG(\"cmdline\",   S_IRUGO, proc_pid_cmdline_ops),\n"
+ " \tONE(\"stat\",      S_IRUGO, proc_tid_stat),\n"
+ " \tONE(\"statm\",     S_IRUGO, proc_pid_statm),\n"
+ "+\tONE(\"memdelay\",  S_IRUGO, proc_pid_memdelay),\n"
+ " \tREG(\"maps\",      S_IRUGO, proc_tid_maps_operations),\n"
+ " #ifdef CONFIG_PROC_CHILDREN\n"
+ " \tREG(\"children\",  S_IRUGO, proc_tid_children_operations),\n"
+ "diff --git a/fs/proc/internal.h b/fs/proc/internal.h\n"
+ "index aa2b89071630..7ab706c316b8 100644\n"
+ "--- a/fs/proc/internal.h\n"
+ "+++ b/fs/proc/internal.h\n"
+ "@@ -146,6 +146,8 @@ extern int proc_pid_status(struct seq_file *, struct pid_namespace *,\n"
+ " \t\t\t   struct pid *, struct task_struct *);\n"
+ " extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,\n"
+ " \t\t\t  struct pid *, struct task_struct *);\n"
+ "+extern int proc_pid_memdelay(struct seq_file *, struct pid_namespace *,\n"
+ "+\t\t\t     struct pid *, struct task_struct *);\n"
+ " \n"
+ " /*\n"
+ "  * base.c\n"
+ "diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h\n"
+ "index 9b15a4bcfa77..1f720d3090f7 100644\n"
+ "--- a/include/linux/memcontrol.h\n"
+ "+++ b/include/linux/memcontrol.h\n"
+ "@@ -30,6 +30,7 @@\n"
+ " #include <linux/vmstat.h>\n"
+ " #include <linux/writeback.h>\n"
+ " #include <linux/page-flags.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ " \n"
+ " struct mem_cgroup;\n"
+ " struct page;\n"
+ "@@ -183,6 +184,9 @@ struct mem_cgroup {\n"
+ " \n"
+ " \tunsigned long soft_limit;\n"
+ " \n"
+ "+\t/* Memory delay measurement domain */\n"
+ "+\tstruct memdelay_domain *memdelay_domain;\n"
+ "+\n"
+ " \t/* vmpressure notifications */\n"
+ " \tstruct vmpressure vmpressure;\n"
+ " \n"
+ "@@ -728,6 +732,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,\n"
+ " \treturn &pgdat->lruvec;\n"
+ " }\n"
+ " \n"
+ "+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)\n"
+ "+{\n"
+ "+\treturn NULL;\n"
+ "+}\n"
+ "+\n"
+ " static inline bool mm_match_cgroup(struct mm_struct *mm,\n"
+ " \t\tstruct mem_cgroup *memcg)\n"
+ " {\n"
+ "@@ -740,6 +749,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,\n"
+ " \treturn true;\n"
+ " }\n"
+ " \n"
+ "+static inline struct mem_cgroup *mem_cgroup_from_task(struct task_struct *task)\n"
+ "+{\n"
+ "+\treturn NULL;\n"
+ "+}\n"
+ "+\n"
+ " static inline struct mem_cgroup *\n"
+ " mem_cgroup_iter(struct mem_cgroup *root,\n"
+ " \t\tstruct mem_cgroup *prev,\n"
+ "diff --git a/include/linux/memdelay.h b/include/linux/memdelay.h\n"
+ "new file mode 100644\n"
+ "index 000000000000..08ed4e4baedf\n"
+ "--- /dev/null\n"
+ "+++ b/include/linux/memdelay.h\n"
+ "@@ -0,0 +1,182 @@\n"
+ "+#ifndef _LINUX_MEMDELAY_H\n"
+ "+#define _LINUX_MEMDELAY_H\n"
+ "+\n"
+ "+#include <linux/spinlock_types.h>\n"
+ "+#include <linux/sched.h>\n"
+ "+\n"
+ "+struct seq_file;\n"
+ "+struct css_set;\n"
+ "+\n"
+ "+/*\n"
+ "+ * Task productivity states tracked by the scheduler\n"
+ "+ */\n"
+ "+enum memdelay_task_state {\n"
+ "+\tMTS_NONE,\t\t/* Idle/unqueued/untracked */\n"
+ "+\tMTS_IOWAIT,\t\t/* Waiting for IO, not memory delayed */\n"
+ "+\tMTS_RUNNABLE,\t\t/* On the runqueue, not memory delayed */\n"
+ "+\tMTS_DELAYED,\t\t/* Memory delayed, not running */\n"
+ "+\tMTS_DELAYED_ACTIVE,\t/* Memory delayed, actively running */\n"
+ "+\tNR_MEMDELAY_TASK_STATES,\n"
+ "+};\n"
+ "+\n"
+ "+/*\n"
+ "+ * System/cgroup delay state tracked by the VM, composed of the\n"
+ "+ * productivity states of all tasks inside the domain.\n"
+ "+ */\n"
+ "+enum memdelay_domain_state {\n"
+ "+\tMDS_NONE,\t\t/* No delayed tasks */\n"
+ "+\tMDS_SOME,\t\t/* Delayed tasks, working tasks */\n"
+ "+\tMDS_FULL,\t\t/* Delayed tasks, no working tasks */\n"
+ "+\tNR_MEMDELAY_DOMAIN_STATES,\n"
+ "+};\n"
+ "+\n"
+ "+struct memdelay_domain_cpu {\n"
+ "+\t/* Task states of the domain on this CPU */\n"
+ "+\tint tasks[NR_MEMDELAY_TASK_STATES];\n"
+ "+\n"
+ "+\t/* Delay state of the domain on this CPU */\n"
+ "+\tenum memdelay_domain_state state;\n"
+ "+\n"
+ "+\t/* Time of last state change */\n"
+ "+\tu64 state_start;\n"
+ "+};\n"
+ "+\n"
+ "+struct memdelay_domain {\n"
+ "+\t/* Aggregate delayed time of all domain tasks */\n"
+ "+\tunsigned long aggregate;\n"
+ "+\n"
+ "+\t/* Per-CPU delay states in the domain */\n"
+ "+\tstruct memdelay_domain_cpu __percpu *mdcs;\n"
+ "+\n"
+ "+\t/* Cumulative state times from all CPUs */\n"
+ "+\tunsigned long times[NR_MEMDELAY_DOMAIN_STATES];\n"
+ "+\n"
+ "+\t/* Decaying state time averages over 1m, 5m, 15m */\n"
+ "+\tunsigned long period_expires;\n"
+ "+\tunsigned long avg_full[3];\n"
+ "+\tunsigned long avg_some[3];\n"
+ "+};\n"
+ "+\n"
+ "+/* mm/memdelay.c */\n"
+ "+extern struct memdelay_domain memdelay_global_domain;\n"
+ "+void memdelay_init(void);\n"
+ "+void memdelay_task_change(struct task_struct *task,\n"
+ "+\t\t\t  enum memdelay_task_state old,\n"
+ "+\t\t\t  enum memdelay_task_state new);\n"
+ "+struct memdelay_domain *memdelay_domain_alloc(void);\n"
+ "+void memdelay_domain_free(struct memdelay_domain *md);\n"
+ "+int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md);\n"
+ "+\n"
+ "+/* kernel/sched/memdelay.c */\n"
+ "+void memdelay_enter(unsigned long *flags);\n"
+ "+void memdelay_leave(unsigned long *flags);\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_schedule - note a context switch\n"
+ "+ * @prev: task scheduling out\n"
+ "+ * @next: task scheduling in\n"
+ "+ *\n"
+ "+ * A task switch doesn't affect the balance between delayed and\n"
+ "+ * productive tasks, but we have to update whether the delay is\n"
+ "+ * actively using the CPU or not.\n"
+ "+ */\n"
+ "+static inline void memdelay_schedule(struct task_struct *prev,\n"
+ "+\t\t\t\t     struct task_struct *next)\n"
+ "+{\n"
+ "+\tif (prev->flags & PF_MEMDELAY)\n"
+ "+\t\tmemdelay_task_change(prev, MTS_DELAYED_ACTIVE, MTS_DELAYED);\n"
+ "+\n"
+ "+\tif (next->flags & PF_MEMDELAY)\n"
+ "+\t\tmemdelay_task_change(next, MTS_DELAYED, MTS_DELAYED_ACTIVE);\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_wakeup - note a task waking up\n"
+ "+ * @task: the task\n"
+ "+ *\n"
+ "+ * Notes an idle task becoming productive. Delayed tasks remain\n"
+ "+ * delayed even when they become runnable.\n"
+ "+ */\n"
+ "+static inline void memdelay_wakeup(struct task_struct *task)\n"
+ "+{\n"
+ "+\tif (task->flags & PF_MEMDELAY)\n"
+ "+\t\treturn;\n"
+ "+\n"
+ "+\tif (task->in_iowait)\n"
+ "+\t\tmemdelay_task_change(task, MTS_IOWAIT, MTS_RUNNABLE);\n"
+ "+\telse\n"
+ "+\t\tmemdelay_task_change(task, MTS_NONE, MTS_RUNNABLE);\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_sleep - note a task going to sleep\n"
+ "+ * @task: the task\n"
+ "+ *\n"
+ "+ * Notes a working task becoming unproductive. Delayed tasks remain\n"
+ "+ * delayed.\n"
+ "+ */\n"
+ "+static inline void memdelay_sleep(struct task_struct *task)\n"
+ "+{\n"
+ "+\tif (task->flags & PF_MEMDELAY)\n"
+ "+\t\treturn;\n"
+ "+\n"
+ "+\tif (task->in_iowait)\n"
+ "+\t\tmemdelay_task_change(task, MTS_RUNNABLE, MTS_IOWAIT);\n"
+ "+\telse\n"
+ "+\t\tmemdelay_task_change(task, MTS_RUNNABLE, MTS_NONE);\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_del_add - track task movement between runqueues\n"
+ "+ * @task: the task\n"
+ "+ * @runnable: a runnable task is moved if %true, unqueued otherwise\n"
+ "+ * @add: task is being added if %true, removed otherwise\n"
+ "+ *\n"
+ "+ * Update the memdelay domain per-cpu states as tasks are being moved\n"
+ "+ * around the runqueues.\n"
+ "+ */\n"
+ "+static inline void memdelay_del_add(struct task_struct *task,\n"
+ "+\t\t\t\t    bool runnable, bool add)\n"
+ "+{\n"
+ "+\tint state;\n"
+ "+\n"
+ "+\tif (task->flags & PF_MEMDELAY)\n"
+ "+\t\tstate = MTS_DELAYED;\n"
+ "+\telse if (runnable)\n"
+ "+\t\tstate = MTS_RUNNABLE;\n"
+ "+\telse if (task->in_iowait)\n"
+ "+\t\tstate = MTS_IOWAIT;\n"
+ "+\telse\n"
+ "+\t\treturn; /* already MTS_NONE */\n"
+ "+\n"
+ "+\tif (add)\n"
+ "+\t\tmemdelay_task_change(task, MTS_NONE, state);\n"
+ "+\telse\n"
+ "+\t\tmemdelay_task_change(task, state, MTS_NONE);\n"
+ "+}\n"
+ "+\n"
+ "+static inline void memdelay_del_runnable(struct task_struct *task)\n"
+ "+{\n"
+ "+\tmemdelay_del_add(task, true, false);\n"
+ "+}\n"
+ "+\n"
+ "+static inline void memdelay_add_runnable(struct task_struct *task)\n"
+ "+{\n"
+ "+\tmemdelay_del_add(task, true, true);\n"
+ "+}\n"
+ "+\n"
+ "+static inline void memdelay_del_sleeping(struct task_struct *task)\n"
+ "+{\n"
+ "+\tmemdelay_del_add(task, false, false);\n"
+ "+}\n"
+ "+\n"
+ "+static inline void memdelay_add_sleeping(struct task_struct *task)\n"
+ "+{\n"
+ "+\tmemdelay_del_add(task, false, true);\n"
+ "+}\n"
+ "+\n"
+ "+#ifdef CONFIG_CGROUPS\n"
+ "+void cgroup_move_task(struct task_struct *task, struct css_set *to);\n"
+ "+#endif\n"
+ "+\n"
+ "+#endif /* _LINUX_MEMDELAY_H */\n"
+ "diff --git a/include/linux/sched.h b/include/linux/sched.h\n"
+ "index c05ac5f5aa03..de15e3c8c43a 100644\n"
+ "--- a/include/linux/sched.h\n"
+ "+++ b/include/linux/sched.h\n"
+ "@@ -651,6 +651,7 @@ struct task_struct {\n"
+ " \t/* disallow userland-initiated cgroup migration */\n"
+ " \tunsigned\t\t\tno_cgroup_migration:1;\n"
+ " #endif\n"
+ "+\tunsigned\t\t\tmemdelay_migrate_enqueue:1;\n"
+ " \n"
+ " \tunsigned long\t\t\tatomic_flags; /* Flags requiring atomic access. */\n"
+ " \n"
+ "@@ -871,6 +872,12 @@ struct task_struct {\n"
+ " \n"
+ " \tstruct io_context\t\t*io_context;\n"
+ " \n"
+ "+\tu64\t\t\t\tmemdelay_start;\n"
+ "+\tunsigned long\t\t\tmemdelay_total;\n"
+ "+#ifdef CONFIG_DEBUG_VM\n"
+ "+\tint\t\t\t\tmemdelay_state;\n"
+ "+#endif\n"
+ "+\n"
+ " \t/* Ptrace state: */\n"
+ " \tunsigned long\t\t\tptrace_message;\n"
+ " \tsiginfo_t\t\t\t*last_siginfo;\n"
+ "@@ -1274,6 +1281,7 @@ extern struct pid *cad_pid;\n"
+ " #define PF_KTHREAD\t\t0x00200000\t/* I am a kernel thread */\n"
+ " #define PF_RANDOMIZE\t\t0x00400000\t/* Randomize virtual address space */\n"
+ " #define PF_SWAPWRITE\t\t0x00800000\t/* Allowed to write to swap */\n"
+ "+#define PF_MEMDELAY\t\t0x01000000\t/* Delayed due to lack of memory */\n"
+ " #define PF_NO_SETAFFINITY\t0x04000000\t/* Userland is not allowed to meddle with cpus_allowed */\n"
+ " #define PF_MCE_EARLY\t\t0x08000000      /* Early kill for mce process policy */\n"
+ " #define PF_MUTEX_TESTER\t\t0x20000000\t/* Thread belongs to the rt mutex tester */\n"
+ "diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c\n"
+ "index df2e0f14a95d..930aaef50396 100644\n"
+ "--- a/kernel/cgroup/cgroup.c\n"
+ "+++ b/kernel/cgroup/cgroup.c\n"
+ "@@ -699,7 +699,8 @@ static void css_set_move_task(struct task_struct *task,\n"
+ " \t\t */\n"
+ " \t\tWARN_ON_ONCE(task->flags & PF_EXITING);\n"
+ " \n"
+ "-\t\trcu_assign_pointer(task->cgroups, to_cset);\n"
+ "+\t\tcgroup_move_task(task, to_cset);\n"
+ "+\n"
+ " \t\tlist_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :\n"
+ " \t\t\t\t\t\t\t     &to_cset->tasks);\n"
+ " \t}\n"
+ "diff --git a/kernel/fork.c b/kernel/fork.c\n"
+ "index b7e9e57b71ea..96dd35393be9 100644\n"
+ "--- a/kernel/fork.c\n"
+ "+++ b/kernel/fork.c\n"
+ "@@ -1208,6 +1208,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)\n"
+ " \tint retval;\n"
+ " \n"
+ " \ttsk->min_flt = tsk->maj_flt = 0;\n"
+ "+\ttsk->memdelay_total = 0;\n"
+ "+#ifdef CONFIG_DEBUG_VM\n"
+ "+\ttsk->memdelay_state = 0;\n"
+ "+#endif\n"
+ " \ttsk->nvcsw = tsk->nivcsw = 0;\n"
+ " #ifdef CONFIG_DETECT_HUNG_TASK\n"
+ " \ttsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;\n"
+ "diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile\n"
+ "index 53f0164ed362..84390fc42f60 100644\n"
+ "--- a/kernel/sched/Makefile\n"
+ "+++ b/kernel/sched/Makefile\n"
+ "@@ -17,7 +17,7 @@ endif\n"
+ " \n"
+ " obj-y += core.o loadavg.o clock.o cputime.o\n"
+ " obj-y += idle_task.o fair.o rt.o deadline.o\n"
+ "-obj-y += wait.o wait_bit.o swait.o completion.o idle.o\n"
+ "+obj-y += wait.o wait_bit.o swait.o completion.o idle.o memdelay.o\n"
+ " obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o\n"
+ " obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o\n"
+ " obj-$(CONFIG_SCHEDSTATS) += stats.o\n"
+ "diff --git a/kernel/sched/core.c b/kernel/sched/core.c\n"
+ "index 0869b20fba81..bf105c870da6 100644\n"
+ "--- a/kernel/sched/core.c\n"
+ "+++ b/kernel/sched/core.c\n"
+ "@@ -26,6 +26,7 @@\n"
+ " #include <linux/profile.h>\n"
+ " #include <linux/security.h>\n"
+ " #include <linux/syscalls.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ " \n"
+ " #include <asm/switch_to.h>\n"
+ " #include <asm/tlb.h>\n"
+ "@@ -759,6 +760,14 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)\n"
+ " \tif (!(flags & ENQUEUE_RESTORE))\n"
+ " \t\tsched_info_queued(rq, p);\n"
+ " \n"
+ "+\tWARN_ON_ONCE(!(flags & ENQUEUE_WAKEUP) && p->memdelay_migrate_enqueue);\n"
+ "+\tif (!(flags & ENQUEUE_WAKEUP) || p->memdelay_migrate_enqueue) {\n"
+ "+\t\tmemdelay_add_runnable(p);\n"
+ "+\t\tp->memdelay_migrate_enqueue = 0;\n"
+ "+\t} else {\n"
+ "+\t\tmemdelay_wakeup(p);\n"
+ "+\t}\n"
+ "+\n"
+ " \tp->sched_class->enqueue_task(rq, p, flags);\n"
+ " }\n"
+ " \n"
+ "@@ -770,6 +779,11 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)\n"
+ " \tif (!(flags & DEQUEUE_SAVE))\n"
+ " \t\tsched_info_dequeued(rq, p);\n"
+ " \n"
+ "+\tif (!(flags & DEQUEUE_SLEEP))\n"
+ "+\t\tmemdelay_del_runnable(p);\n"
+ "+\telse\n"
+ "+\t\tmemdelay_sleep(p);\n"
+ "+\n"
+ " \tp->sched_class->dequeue_task(rq, p, flags);\n"
+ " }\n"
+ " \n"
+ "@@ -2044,7 +2058,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)\n"
+ " \n"
+ " \tcpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);\n"
+ " \tif (task_cpu(p) != cpu) {\n"
+ "+\t\tstruct rq_flags rf;\n"
+ "+\t\tstruct rq *rq;\n"
+ "+\n"
+ " \t\twake_flags |= WF_MIGRATED;\n"
+ "+\n"
+ "+\t\trq = __task_rq_lock(p, &rf);\n"
+ "+\t\tmemdelay_del_sleeping(p);\n"
+ "+\t\t__task_rq_unlock(rq, &rf);\n"
+ "+\t\tp->memdelay_migrate_enqueue = 1;\n"
+ "+\n"
+ " \t\tset_task_cpu(p, cpu);\n"
+ " \t}\n"
+ " \n"
+ "@@ -3326,6 +3349,8 @@ static void __sched notrace __schedule(bool preempt)\n"
+ " \t\trq->curr = next;\n"
+ " \t\t++*switch_count;\n"
+ " \n"
+ "+\t\tmemdelay_schedule(prev, next);\n"
+ "+\n"
+ " \t\ttrace_sched_switch(preempt, prev, next);\n"
+ " \n"
+ " \t\t/* Also unlocks the rq: */\n"
+ "@@ -5919,6 +5944,8 @@ void __init sched_init(void)\n"
+ " \n"
+ " \tinit_schedstats();\n"
+ " \n"
+ "+\tmemdelay_init();\n"
+ "+\n"
+ " \tscheduler_running = 1;\n"
+ " }\n"
+ " \n"
+ "diff --git a/kernel/sched/memdelay.c b/kernel/sched/memdelay.c\n"
+ "new file mode 100644\n"
+ "index 000000000000..1d4813cd018a\n"
+ "--- /dev/null\n"
+ "+++ b/kernel/sched/memdelay.c\n"
+ "@@ -0,0 +1,118 @@\n"
+ "+/*\n"
+ "+ * Memory delay metric\n"
+ "+ *\n"
+ "+ * Copyright (c) 2017 Facebook, Johannes Weiner\n"
+ "+ *\n"
+ "+ * This code quantifies and reports to userspace the wall-time impact\n"
+ "+ * of memory pressure on the system and memory-controlled cgroups.\n"
+ "+ */\n"
+ "+\n"
+ "+#include <linux/memdelay.h>\n"
+ "+#include <linux/cgroup.h>\n"
+ "+#include <linux/sched.h>\n"
+ "+\n"
+ "+#include \"sched.h\"\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_enter - mark the beginning of a memory delay section\n"
+ "+ * @flags: flags to handle nested memdelay sections\n"
+ "+ *\n"
+ "+ * Marks the calling task as being delayed due to a lack of memory,\n"
+ "+ * such as waiting for a workingset refault or performing reclaim.\n"
+ "+ */\n"
+ "+void memdelay_enter(unsigned long *flags)\n"
+ "+{\n"
+ "+\tstruct rq_flags rf;\n"
+ "+\tstruct rq *rq;\n"
+ "+\n"
+ "+\t*flags = current->flags & PF_MEMDELAY;\n"
+ "+\tif (*flags)\n"
+ "+\t\treturn;\n"
+ "+\t/*\n"
+ "+\t * PF_MEMDELAY & accounting needs to be atomic wrt changes to\n"
+ "+\t * the task's scheduling state and its domain association.\n"
+ "+\t * Otherwise we could race with CPU or cgroup migration and\n"
+ "+\t * misaccount.\n"
+ "+\t */\n"
+ "+\tlocal_irq_disable();\n"
+ "+\trq = this_rq();\n"
+ "+\trq_lock(rq, &rf);\n"
+ "+\n"
+ "+\tcurrent->flags |= PF_MEMDELAY;\n"
+ "+\tmemdelay_task_change(current, MTS_RUNNABLE, MTS_DELAYED_ACTIVE);\n"
+ "+\n"
+ "+\trq_unlock(rq, &rf);\n"
+ "+\tlocal_irq_enable();\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_leave - mark the end of a memory delay section\n"
+ "+ * @flags: flags to handle nested memdelay sections\n"
+ "+ *\n"
+ "+ * Marks the calling task as no longer delayed due to memory.\n"
+ "+ */\n"
+ "+void memdelay_leave(unsigned long *flags)\n"
+ "+{\n"
+ "+\tstruct rq_flags rf;\n"
+ "+\tstruct rq *rq;\n"
+ "+\n"
+ "+\tif (*flags)\n"
+ "+\t\treturn;\n"
+ "+\t/*\n"
+ "+\t * PF_MEMDELAY & accounting needs to be atomic wrt changes to\n"
+ "+\t * the task's scheduling state and its domain association.\n"
+ "+\t * Otherwise we could race with CPU or cgroup migration and\n"
+ "+\t * misaccount.\n"
+ "+\t */\n"
+ "+\tlocal_irq_disable();\n"
+ "+\trq = this_rq();\n"
+ "+\trq_lock(rq, &rf);\n"
+ "+\n"
+ "+\tcurrent->flags &= ~PF_MEMDELAY;\n"
+ "+\tmemdelay_task_change(current, MTS_DELAYED_ACTIVE, MTS_RUNNABLE);\n"
+ "+\n"
+ "+\trq_unlock(rq, &rf);\n"
+ "+\tlocal_irq_enable();\n"
+ "+}\n"
+ "+\n"
+ "+#ifdef CONFIG_CGROUPS\n"
+ "+/**\n"
+ "+ * cgroup_move_task - move task to a different cgroup\n"
+ "+ * @task: the task\n"
+ "+ * @to: the target css_set\n"
+ "+ *\n"
+ "+ * Move task to a new cgroup and safely migrate its associated\n"
+ "+ * delayed/working state between the different domains.\n"
+ "+ *\n"
+ "+ * This function acquires the task's rq lock to lock out concurrent\n"
+ "+ * changes to the task's scheduling state and - in case the task is\n"
+ "+ * running - concurrent changes to its delay state.\n"
+ "+ */\n"
+ "+void cgroup_move_task(struct task_struct *task, struct css_set *to)\n"
+ "+{\n"
+ "+\tstruct rq_flags rf;\n"
+ "+\tstruct rq *rq;\n"
+ "+\tint state;\n"
+ "+\n"
+ "+\trq = task_rq_lock(task, &rf);\n"
+ "+\n"
+ "+\tif (task->flags & PF_MEMDELAY)\n"
+ "+\t\tstate = MTS_DELAYED + task_current(rq, task);\n"
+ "+\telse if (task_on_rq_queued(task))\n"
+ "+\t\tstate = MTS_RUNNABLE;\n"
+ "+\telse if (task->in_iowait)\n"
+ "+\t\tstate = MTS_IOWAIT;\n"
+ "+\telse\n"
+ "+\t\tstate = MTS_NONE;\n"
+ "+\n"
+ "+\t/*\n"
+ "+\t * Lame to do this here, but the scheduler cannot be locked\n"
+ "+\t * from the outside, so we move cgroups from inside sched/.\n"
+ "+\t */\n"
+ "+\tmemdelay_task_change(task, state, MTS_NONE);\n"
+ "+\trcu_assign_pointer(task->cgroups, to);\n"
+ "+\tmemdelay_task_change(task, MTS_NONE, state);\n"
+ "+\n"
+ "+\ttask_rq_unlock(rq, task, &rf);\n"
+ "+}\n"
+ "+#endif /* CONFIG_CGROUPS */\n"
+ "diff --git a/mm/Makefile b/mm/Makefile\n"
+ "index 411bd24d4a7c..c9bdbc5627e5 100644\n"
+ "--- a/mm/Makefile\n"
+ "+++ b/mm/Makefile\n"
+ "@@ -39,7 +39,7 @@ obj-y\t\t\t:= filemap.o mempool.o oom_kill.o \\\n"
+ " \t\t\t   mm_init.o mmu_context.o percpu.o slab_common.o \\\n"
+ " \t\t\t   compaction.o vmacache.o swap_slots.o \\\n"
+ " \t\t\t   interval_tree.o list_lru.o workingset.o \\\n"
+ "-\t\t\t   debug.o $(mmu-y)\n"
+ "+\t\t\t   memdelay.o debug.o $(mmu-y)\n"
+ " \n"
+ " obj-y += init-mm.o\n"
+ " \n"
+ "diff --git a/mm/compaction.c b/mm/compaction.c\n"
+ "index fb548e4c7bd4..adf67de23fee 100644\n"
+ "--- a/mm/compaction.c\n"
+ "+++ b/mm/compaction.c\n"
+ "@@ -2040,11 +2040,15 @@ static int kcompactd(void *p)\n"
+ " \tpgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;\n"
+ " \n"
+ " \twhile (!kthread_should_stop()) {\n"
+ "+\t\tunsigned long mdflags;\n"
+ "+\n"
+ " \t\ttrace_mm_compaction_kcompactd_sleep(pgdat->node_id);\n"
+ " \t\twait_event_freezable(pgdat->kcompactd_wait,\n"
+ " \t\t\t\tkcompactd_work_requested(pgdat));\n"
+ " \n"
+ "+\t\tmemdelay_enter(&mdflags);\n"
+ " \t\tkcompactd_do_work(pgdat);\n"
+ "+\t\tmemdelay_leave(&mdflags);\n"
+ " \t}\n"
+ " \n"
+ " \treturn 0;\n"
+ "diff --git a/mm/filemap.c b/mm/filemap.c\n"
+ "index da55a5693da9..648418694405 100644\n"
+ "--- a/mm/filemap.c\n"
+ "+++ b/mm/filemap.c\n"
+ "@@ -36,6 +36,7 @@\n"
+ " #include <linux/memcontrol.h>\n"
+ " #include <linux/cleancache.h>\n"
+ " #include <linux/rmap.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ " #include \"internal.h\"\n"
+ " \n"
+ " #define CREATE_TRACE_POINTS\n"
+ "@@ -961,8 +962,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,\n"
+ " {\n"
+ " \tstruct wait_page_queue wait_page;\n"
+ " \twait_queue_entry_t *wait = &wait_page.wait;\n"
+ "+\tunsigned long mdflags;\n"
+ "+\tbool refault = false;\n"
+ " \tint ret = 0;\n"
+ " \n"
+ "+\tif (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) {\n"
+ "+\t\tmemdelay_enter(&mdflags);\n"
+ "+\t\trefault = true;\n"
+ "+\t}\n"
+ "+\n"
+ " \tinit_wait(wait);\n"
+ " \twait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;\n"
+ " \twait->func = wake_page_function;\n"
+ "@@ -1001,6 +1009,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,\n"
+ " \n"
+ " \tfinish_wait(q, wait);\n"
+ " \n"
+ "+\tif (refault)\n"
+ "+\t\tmemdelay_leave(&mdflags);\n"
+ "+\n"
+ " \t/*\n"
+ " \t * A signal could leave PageWaiters set. Clearing it here if\n"
+ " \t * !waitqueue_active would be possible (by open-coding finish_wait),\n"
+ "diff --git a/mm/memcontrol.c b/mm/memcontrol.c\n"
+ "index 93b2eb063afd..102f0f4d3f5c 100644\n"
+ "--- a/mm/memcontrol.c\n"
+ "+++ b/mm/memcontrol.c\n"
+ "@@ -65,6 +65,7 @@\n"
+ " #include <linux/lockdep.h>\n"
+ " #include <linux/file.h>\n"
+ " #include <linux/tracehook.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ " #include \"internal.h\"\n"
+ " #include <net/sock.h>\n"
+ " #include <net/ip.h>\n"
+ "@@ -3926,6 +3927,8 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,\n"
+ " \treturn ret;\n"
+ " }\n"
+ " \n"
+ "+static int memory_memdelay_show(struct seq_file *m, void *v);\n"
+ "+\n"
+ " static struct cftype mem_cgroup_legacy_files[] = {\n"
+ " \t{\n"
+ " \t\t.name = \"usage_in_bytes\",\n"
+ "@@ -3993,6 +3996,10 @@ static struct cftype mem_cgroup_legacy_files[] = {\n"
+ " \t{\n"
+ " \t\t.name = \"pressure_level\",\n"
+ " \t},\n"
+ "+\t{\n"
+ "+\t\t.name = \"memdelay\",\n"
+ "+\t\t.seq_show = memory_memdelay_show,\n"
+ "+\t},\n"
+ " #ifdef CONFIG_NUMA\n"
+ " \t{\n"
+ " \t\t.name = \"numa_stat\",\n"
+ "@@ -4170,6 +4177,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)\n"
+ " \n"
+ " \tfor_each_node(node)\n"
+ " \t\tfree_mem_cgroup_per_node_info(memcg, node);\n"
+ "+\tmemdelay_domain_free(memcg->memdelay_domain);\n"
+ " \tfree_percpu(memcg->stat);\n"
+ " \tkfree(memcg);\n"
+ " }\n"
+ "@@ -4275,10 +4283,15 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)\n"
+ " \n"
+ " \t/* The following stuff does not apply to the root */\n"
+ " \tif (!parent) {\n"
+ "+\t\tmemcg->memdelay_domain = &memdelay_global_domain;\n"
+ " \t\troot_mem_cgroup = memcg;\n"
+ " \t\treturn &memcg->css;\n"
+ " \t}\n"
+ " \n"
+ "+\tmemcg->memdelay_domain = memdelay_domain_alloc();\n"
+ "+\tif (!memcg->memdelay_domain)\n"
+ "+\t\tgoto fail;\n"
+ "+\n"
+ " \terror = memcg_online_kmem(memcg);\n"
+ " \tif (error)\n"
+ " \t\tgoto fail;\n"
+ "@@ -5282,6 +5295,13 @@ static int memory_stat_show(struct seq_file *m, void *v)\n"
+ " \treturn 0;\n"
+ " }\n"
+ " \n"
+ "+static int memory_memdelay_show(struct seq_file *m, void *v)\n"
+ "+{\n"
+ "+\tstruct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));\n"
+ "+\n"
+ "+\treturn memdelay_domain_show(m, memcg->memdelay_domain);\n"
+ "+}\n"
+ "+\n"
+ " static struct cftype memory_files[] = {\n"
+ " \t{\n"
+ " \t\t.name = \"current\",\n"
+ "@@ -5317,6 +5337,11 @@ static struct cftype memory_files[] = {\n"
+ " \t\t.flags = CFTYPE_NOT_ON_ROOT,\n"
+ " \t\t.seq_show = memory_stat_show,\n"
+ " \t},\n"
+ "+\t{\n"
+ "+\t\t.name = \"memdelay\",\n"
+ "+\t\t.flags = CFTYPE_NOT_ON_ROOT,\n"
+ "+\t\t.seq_show = memory_memdelay_show,\n"
+ "+\t},\n"
+ " \t{ }\t/* terminate */\n"
+ " };\n"
+ " \n"
+ "diff --git a/mm/memdelay.c b/mm/memdelay.c\n"
+ "new file mode 100644\n"
+ "index 000000000000..c43d6f7ba22a\n"
+ "--- /dev/null\n"
+ "+++ b/mm/memdelay.c\n"
+ "@@ -0,0 +1,285 @@\n"
+ "+/*\n"
+ "+ * Memory delay metric\n"
+ "+ *\n"
+ "+ * Copyright (c) 2017 Facebook, Johannes Weiner\n"
+ "+ *\n"
+ "+ * This code quantifies and reports to userspace the wall-time impact\n"
+ "+ * of memory pressure on the system and memory-controlled cgroups.\n"
+ "+ */\n"
+ "+\n"
+ "+#include <linux/sched/loadavg.h>\n"
+ "+#include <linux/sched/clock.h>\n"
+ "+#include <linux/memcontrol.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ "+#include <linux/seq_file.h>\n"
+ "+#include <linux/proc_fs.h>\n"
+ "+#include <linux/kernel.h>\n"
+ "+#include <linux/module.h>\n"
+ "+#include <linux/slab.h>\n"
+ "+#include <linux/fs.h>\n"
+ "+\n"
+ "+static DEFINE_PER_CPU(struct memdelay_domain_cpu, global_domain_cpus);\n"
+ "+\n"
+ "+/* System-level keeping of memory delay statistics */\n"
+ "+struct memdelay_domain memdelay_global_domain = {\n"
+ "+\t.mdcs = &global_domain_cpus,\n"
+ "+};\n"
+ "+\n"
+ "+static void domain_init(struct memdelay_domain *md)\n"
+ "+{\n"
+ "+\tmd->period_expires = jiffies + LOAD_FREQ;\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_init - initialize the memdelay subsystem\n"
+ "+ *\n"
+ "+ * This needs to run before the scheduler starts queuing and\n"
+ "+ * scheduling tasks.\n"
+ "+ */\n"
+ "+void __init memdelay_init(void)\n"
+ "+{\n"
+ "+\tdomain_init(&memdelay_global_domain);\n"
+ "+}\n"
+ "+\n"
+ "+static void domain_move_clock(struct memdelay_domain *md)\n"
+ "+{\n"
+ "+\tunsigned long expires = READ_ONCE(md->period_expires);\n"
+ "+\tunsigned long none, some, full;\n"
+ "+\tint missed_periods;\n"
+ "+\tunsigned long next;\n"
+ "+\tint i;\n"
+ "+\n"
+ "+\tif (time_before(jiffies, expires))\n"
+ "+\t\treturn;\n"
+ "+\n"
+ "+\tmissed_periods = 1 + (jiffies - expires) / LOAD_FREQ;\n"
+ "+\tnext = expires + (missed_periods * LOAD_FREQ);\n"
+ "+\n"
+ "+\tif (cmpxchg(&md->period_expires, expires, next) != expires)\n"
+ "+\t\treturn;\n"
+ "+\n"
+ "+\tnone = xchg(&md->times[MDS_NONE], 0);\n"
+ "+\tsome = xchg(&md->times[MDS_SOME], 0);\n"
+ "+\tfull = xchg(&md->times[MDS_FULL], 0);\n"
+ "+\n"
+ "+\tfor (i = 0; i < missed_periods; i++) {\n"
+ "+\t\tunsigned long pct;\n"
+ "+\n"
+ "+\t\tpct = some * 100 / max(none + some + full, 1UL);\n"
+ "+\t\tpct *= FIXED_1;\n"
+ "+\t\tCALC_LOAD(md->avg_some[0], EXP_1, pct);\n"
+ "+\t\tCALC_LOAD(md->avg_some[1], EXP_5, pct);\n"
+ "+\t\tCALC_LOAD(md->avg_some[2], EXP_15, pct);\n"
+ "+\n"
+ "+\t\tpct = full * 100 / max(none + some + full, 1UL);\n"
+ "+\t\tpct *= FIXED_1;\n"
+ "+\t\tCALC_LOAD(md->avg_full[0], EXP_1, pct);\n"
+ "+\t\tCALC_LOAD(md->avg_full[1], EXP_5, pct);\n"
+ "+\t\tCALC_LOAD(md->avg_full[2], EXP_15, pct);\n"
+ "+\n"
+ "+\t\tnone = some = full = 0;\n"
+ "+\t}\n"
+ "+}\n"
+ "+\n"
+ "+static void domain_cpu_update(struct memdelay_domain *md, int cpu,\n"
+ "+\t\t\t      enum memdelay_task_state old,\n"
+ "+\t\t\t      enum memdelay_task_state new)\n"
+ "+{\n"
+ "+\tenum memdelay_domain_state state;\n"
+ "+\tstruct memdelay_domain_cpu *mdc;\n"
+ "+\tunsigned long delta;\n"
+ "+\tu64 now;\n"
+ "+\n"
+ "+\tmdc = per_cpu_ptr(md->mdcs, cpu);\n"
+ "+\n"
+ "+\tif (old) {\n"
+ "+\t\tWARN_ONCE(!mdc->tasks[old], \"cpu=%d old=%d new=%d counter=%d\\n\",\n"
+ "+\t\t\t  cpu, old, new, mdc->tasks[old]);\n"
+ "+\t\tmdc->tasks[old] -= 1;\n"
+ "+\t}\n"
+ "+\tif (new)\n"
+ "+\t\tmdc->tasks[new] += 1;\n"
+ "+\n"
+ "+\t/*\n"
+ "+\t * The domain is somewhat delayed when a number of tasks are\n"
+ "+\t * delayed but there are still others running the workload.\n"
+ "+\t *\n"
+ "+\t * The domain is fully delayed when all non-idle tasks on the\n"
+ "+\t * CPU are delayed, or when a delayed task is actively running\n"
+ "+\t * and preventing productive tasks from making headway.\n"
+ "+\t *\n"
+ "+\t * The state times then add up over all CPUs in the domain: if\n"
+ "+\t * the domain is fully blocked on one CPU and there is another\n"
+ "+\t * one running the workload, the domain is considered fully\n"
+ "+\t * blocked 50% of the time.\n"
+ "+\t */\n"
+ "+\tif (mdc->tasks[MTS_DELAYED_ACTIVE] && !mdc->tasks[MTS_IOWAIT])\n"
+ "+\t\tstate = MDS_FULL;\n"
+ "+\telse if (mdc->tasks[MTS_DELAYED])\n"
+ "+\t\tstate = (mdc->tasks[MTS_RUNNABLE] || mdc->tasks[MTS_IOWAIT]) ?\n"
+ "+\t\t\tMDS_SOME : MDS_FULL;\n"
+ "+\telse\n"
+ "+\t\tstate = MDS_NONE;\n"
+ "+\n"
+ "+\tif (mdc->state == state)\n"
+ "+\t\treturn;\n"
+ "+\n"
+ "+\tnow = cpu_clock(cpu);\n"
+ "+\tdelta = (now - mdc->state_start) / NSEC_PER_USEC;\n"
+ "+\n"
+ "+\tdomain_move_clock(md);\n"
+ "+\tmd->times[mdc->state] += delta;\n"
+ "+\n"
+ "+\tmdc->state = state;\n"
+ "+\tmdc->state_start = now;\n"
+ "+}\n"
+ "+\n"
+ "+static struct memdelay_domain *memcg_domain(struct mem_cgroup *memcg)\n"
+ "+{\n"
+ "+#ifdef CONFIG_MEMCG\n"
+ "+\tif (!mem_cgroup_disabled())\n"
+ "+\t\treturn memcg->memdelay_domain;\n"
+ "+#endif\n"
+ "+\treturn &memdelay_global_domain;\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_task_change - note a task changing its delay/work state\n"
+ "+ * @task: the task changing state\n"
+ "+ * @old: old task state\n"
+ "+ * @new: new task state\n"
+ "+ *\n"
+ "+ * Updates the task's domain counters to reflect a change in the\n"
+ "+ * task's delayed/working state.\n"
+ "+ */\n"
+ "+void memdelay_task_change(struct task_struct *task,\n"
+ "+\t\t\t  enum memdelay_task_state old,\n"
+ "+\t\t\t  enum memdelay_task_state new)\n"
+ "+{\n"
+ "+\tint cpu = task_cpu(task);\n"
+ "+\tstruct mem_cgroup *memcg;\n"
+ "+\tunsigned long delay = 0;\n"
+ "+\n"
+ "+#ifdef CONFIG_DEBUG_VM\n"
+ "+\tWARN_ONCE(task->memdelay_state != old,\n"
+ "+\t\t  \"cpu=%d task=%p state=%d (in_iowait=%d PF_MEMDELAYED=%d) old=%d new=%d\\n\",\n"
+ "+\t\t  cpu, task, task->memdelay_state, task->in_iowait,\n"
+ "+\t\t  !!(task->flags & PF_MEMDELAY), old, new);\n"
+ "+\ttask->memdelay_state = new;\n"
+ "+#endif\n"
+ "+\n"
+ "+\t/* Account when tasks are entering and leaving delays */\n"
+ "+\tif (old < MTS_DELAYED && new >= MTS_DELAYED) {\n"
+ "+\t\ttask->memdelay_start = cpu_clock(cpu);\n"
+ "+\t} else if (old >= MTS_DELAYED && new < MTS_DELAYED) {\n"
+ "+\t\tdelay = (cpu_clock(cpu) - task->memdelay_start) / NSEC_PER_USEC;\n"
+ "+\t\ttask->memdelay_total += delay;\n"
+ "+\t}\n"
+ "+\n"
+ "+\t/* Account domain state changes */\n"
+ "+\trcu_read_lock();\n"
+ "+\tmemcg = mem_cgroup_from_task(task);\n"
+ "+\tdo {\n"
+ "+\t\tstruct memdelay_domain *md;\n"
+ "+\n"
+ "+\t\tmd = memcg_domain(memcg);\n"
+ "+\t\tmd->aggregate += delay;\n"
+ "+\t\tdomain_cpu_update(md, cpu, old, new);\n"
+ "+\t} while (memcg && (memcg = parent_mem_cgroup(memcg)));\n"
+ "+\trcu_read_unlock();\n"
+ "+};\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_domain_alloc - allocate a cgroup memory delay domain\n"
+ "+ */\n"
+ "+struct memdelay_domain *memdelay_domain_alloc(void)\n"
+ "+{\n"
+ "+\tstruct memdelay_domain *md;\n"
+ "+\n"
+ "+\tmd = kzalloc(sizeof(*md), GFP_KERNEL);\n"
+ "+\tif (!md)\n"
+ "+\t\treturn NULL;\n"
+ "+\tmd->mdcs = alloc_percpu(struct memdelay_domain_cpu);\n"
+ "+\tif (!md->mdcs) {\n"
+ "+\t\tkfree(md);\n"
+ "+\t\treturn NULL;\n"
+ "+\t}\n"
+ "+\tdomain_init(md);\n"
+ "+\treturn md;\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_domain_free - free a cgroup memory delay domain\n"
+ "+ */\n"
+ "+void memdelay_domain_free(struct memdelay_domain *md)\n"
+ "+{\n"
+ "+\tif (md) {\n"
+ "+\t\tfree_percpu(md->mdcs);\n"
+ "+\t\tkfree(md);\n"
+ "+\t}\n"
+ "+}\n"
+ "+\n"
+ "+/**\n"
+ "+ * memdelay_domain_show - format memory delay domain stats to a seq_file\n"
+ "+ * @s: the seq_file\n"
+ "+ * @md: the memory domain\n"
+ "+ */\n"
+ "+int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md)\n"
+ "+{\n"
+ "+\tdomain_move_clock(md);\n"
+ "+\n"
+ "+\tseq_printf(s, \"%lu\\n\", md->aggregate);\n"
+ "+\n"
+ "+\tseq_printf(s, \"%lu.%02lu %lu.%02lu %lu.%02lu\\n\",\n"
+ "+\t\t   LOAD_INT(md->avg_some[0]), LOAD_FRAC(md->avg_some[0]),\n"
+ "+\t\t   LOAD_INT(md->avg_some[1]), LOAD_FRAC(md->avg_some[1]),\n"
+ "+\t\t   LOAD_INT(md->avg_some[2]), LOAD_FRAC(md->avg_some[2]));\n"
+ "+\n"
+ "+\tseq_printf(s, \"%lu.%02lu %lu.%02lu %lu.%02lu\\n\",\n"
+ "+\t\t   LOAD_INT(md->avg_full[0]), LOAD_FRAC(md->avg_full[0]),\n"
+ "+\t\t   LOAD_INT(md->avg_full[1]), LOAD_FRAC(md->avg_full[1]),\n"
+ "+\t\t   LOAD_INT(md->avg_full[2]), LOAD_FRAC(md->avg_full[2]));\n"
+ "+\n"
+ "+#ifdef CONFIG_DEBUG_VM\n"
+ "+\t{\n"
+ "+\t\tint cpu;\n"
+ "+\n"
+ "+\t\tfor_each_online_cpu(cpu) {\n"
+ "+\t\t\tstruct memdelay_domain_cpu *mdc;\n"
+ "+\n"
+ "+\t\t\tmdc = per_cpu_ptr(md->mdcs, cpu);\n"
+ "+\t\t\tseq_printf(s, \"%d %d %d %d\\n\",\n"
+ "+\t\t\t\t   mdc->tasks[MTS_IOWAIT],\n"
+ "+\t\t\t\t   mdc->tasks[MTS_RUNNABLE],\n"
+ "+\t\t\t\t   mdc->tasks[MTS_DELAYED],\n"
+ "+\t\t\t\t   mdc->tasks[MTS_DELAYED_ACTIVE]);\n"
+ "+\t\t}\n"
+ "+\t}\n"
+ "+#endif\n"
+ "+\n"
+ "+\treturn 0;\n"
+ "+}\n"
+ "+\n"
+ "+static int memdelay_show(struct seq_file *m, void *v)\n"
+ "+{\n"
+ "+\treturn memdelay_domain_show(m, &memdelay_global_domain);\n"
+ "+}\n"
+ "+\n"
+ "+static int memdelay_open(struct inode *inode, struct file *file)\n"
+ "+{\n"
+ "+\treturn single_open(file, memdelay_show, NULL);\n"
+ "+}\n"
+ "+\n"
+ "+static const struct file_operations memdelay_fops = {\n"
+ "+\t.open           = memdelay_open,\n"
+ "+\t.read           = seq_read,\n"
+ "+\t.llseek         = seq_lseek,\n"
+ "+\t.release        = single_release,\n"
+ "+};\n"
+ "+\n"
+ "+static int __init memdelay_proc_init(void)\n"
+ "+{\n"
+ "+\tproc_create(\"memdelay\", 0, NULL, &memdelay_fops);\n"
+ "+\treturn 0;\n"
+ "+}\n"
+ "+module_init(memdelay_proc_init);\n"
+ "diff --git a/mm/page_alloc.c b/mm/page_alloc.c\n"
+ "index 1423da8dd16f..d8d01e9df982 100644\n"
+ "--- a/mm/page_alloc.c\n"
+ "+++ b/mm/page_alloc.c\n"
+ "@@ -67,6 +67,7 @@\n"
+ " #include <linux/memcontrol.h>\n"
+ " #include <linux/ftrace.h>\n"
+ " #include <linux/nmi.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ " \n"
+ " #include <asm/sections.h>\n"
+ " #include <asm/tlbflush.h>\n"
+ "@@ -3364,16 +3365,19 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,\n"
+ " \t\tunsigned int alloc_flags, const struct alloc_context *ac,\n"
+ " \t\tenum compact_priority prio, enum compact_result *compact_result)\n"
+ " {\n"
+ "-\tstruct page *page;\n"
+ " \tunsigned int noreclaim_flag;\n"
+ "+\tunsigned long mdflags;\n"
+ "+\tstruct page *page;\n"
+ " \n"
+ " \tif (!order)\n"
+ " \t\treturn NULL;\n"
+ " \n"
+ "+\tmemdelay_enter(&mdflags);\n"
+ " \tnoreclaim_flag = memalloc_noreclaim_save();\n"
+ " \t*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,\n"
+ " \t\t\t\t\t\t\t\t\tprio);\n"
+ " \tmemalloc_noreclaim_restore(noreclaim_flag);\n"
+ "+\tmemdelay_leave(&mdflags);\n"
+ " \n"
+ " \tif (*compact_result <= COMPACT_INACTIVE)\n"
+ " \t\treturn NULL;\n"
+ "@@ -3519,13 +3523,15 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,\n"
+ " \t\t\t\t\tconst struct alloc_context *ac)\n"
+ " {\n"
+ " \tstruct reclaim_state reclaim_state;\n"
+ "-\tint progress;\n"
+ " \tunsigned int noreclaim_flag;\n"
+ "+\tunsigned long mdflags;\n"
+ "+\tint progress;\n"
+ " \n"
+ " \tcond_resched();\n"
+ " \n"
+ " \t/* We now go into synchronous reclaim */\n"
+ " \tcpuset_memory_pressure_bump();\n"
+ "+\tmemdelay_enter(&mdflags);\n"
+ " \tnoreclaim_flag = memalloc_noreclaim_save();\n"
+ " \tlockdep_set_current_reclaim_state(gfp_mask);\n"
+ " \treclaim_state.reclaimed_slab = 0;\n"
+ "@@ -3537,6 +3543,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,\n"
+ " \tcurrent->reclaim_state = NULL;\n"
+ " \tlockdep_clear_current_reclaim_state();\n"
+ " \tmemalloc_noreclaim_restore(noreclaim_flag);\n"
+ "+\tmemdelay_leave(&mdflags);\n"
+ " \n"
+ " \tcond_resched();\n"
+ " \n"
+ "diff --git a/mm/vmscan.c b/mm/vmscan.c\n"
+ "index 60357cd84c67..1029305b9b3a 100644\n"
+ "--- a/mm/vmscan.c\n"
+ "+++ b/mm/vmscan.c\n"
+ "@@ -48,6 +48,7 @@\n"
+ " #include <linux/prefetch.h>\n"
+ " #include <linux/printk.h>\n"
+ " #include <linux/dax.h>\n"
+ "+#include <linux/memdelay.h>\n"
+ " \n"
+ " #include <asm/tlbflush.h>\n"
+ " #include <asm/div64.h>\n"
+ "@@ -3098,6 +3099,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,\n"
+ " {\n"
+ " \tstruct zonelist *zonelist;\n"
+ " \tunsigned long nr_reclaimed;\n"
+ "+\tunsigned long mdflags;\n"
+ " \tint nid;\n"
+ " \tunsigned int noreclaim_flag;\n"
+ " \tstruct scan_control sc = {\n"
+ "@@ -3126,9 +3128,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,\n"
+ " \t\t\t\t\t    sc.gfp_mask,\n"
+ " \t\t\t\t\t    sc.reclaim_idx);\n"
+ " \n"
+ "+\tmemdelay_enter(&mdflags);\n"
+ " \tnoreclaim_flag = memalloc_noreclaim_save();\n"
+ " \tnr_reclaimed = do_try_to_free_pages(zonelist, &sc);\n"
+ " \tmemalloc_noreclaim_restore(noreclaim_flag);\n"
+ "+\tmemdelay_leave(&mdflags);\n"
+ " \n"
+ " \ttrace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);\n"
+ " \n"
+ "@@ -3550,6 +3554,7 @@ static int kswapd(void *p)\n"
+ " \tpgdat->kswapd_order = 0;\n"
+ " \tpgdat->kswapd_classzone_idx = MAX_NR_ZONES;\n"
+ " \tfor ( ; ; ) {\n"
+ "+\t\tunsigned long mdflags;\n"
+ " \t\tbool ret;\n"
+ " \n"
+ " \t\talloc_order = reclaim_order = pgdat->kswapd_order;\n"
+ "@@ -3586,7 +3591,11 @@ static int kswapd(void *p)\n"
+ " \t\t */\n"
+ " \t\ttrace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,\n"
+ " \t\t\t\t\t\talloc_order);\n"
+ "+\n"
+ "+\t\tmemdelay_enter(&mdflags);\n"
+ " \t\treclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);\n"
+ "+\t\tmemdelay_leave(&mdflags);\n"
+ "+\n"
+ " \t\tif (reclaim_order < alloc_order)\n"
+ " \t\t\tgoto kswapd_try_sleep;\n"
+ " \t}\n"
+ "-- \n"
+ 2.14.1
 
-b4b1585f187ee8e2be73d0a832d73e143305734a4c5face8904b27985255f21f
+b24ffc48b913b11ce4c4452d3550c52327b6c13773cda91c3d49941302a848f0

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.