diff for duplicates of <20170918163434.GA11236@cmpxchg.org> diff --git a/a/2.txt b/N1/2.txt index 8b13789..e97ac5e 100644 --- a/a/2.txt +++ b/N1/2.txt @@ -1 +1,114 @@ +>From d5ffeb4d9d65fcff1b7e50dbde8264b4c32824a5 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner <hannes@cmpxchg.org> +Date: Wed, 14 Jun 2017 11:12:05 -0400 +Subject: [PATCH 1/3] sched/loadavg: consolidate LOAD_INT, LOAD_FRAC macros +There are several identical definitions of those macros in places that +mess with fixed-point load averages. Provide an official version. + +Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> +--- + arch/powerpc/platforms/cell/spufs/sched.c | 3 --- + arch/s390/appldata/appldata_os.c | 4 ---- + drivers/cpuidle/governors/menu.c | 4 ---- + fs/proc/loadavg.c | 3 --- + include/linux/sched/loadavg.h | 3 +++ + kernel/debug/kdb/kdb_main.c | 7 +------ + 6 files changed, 4 insertions(+), 20 deletions(-) + +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index 1fbb5da17dd2..de544070def3 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx, + } + } + +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +- + static int show_spu_loadavg(struct seq_file *s, void *private) + { + int a, b, c; +diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c +index 45b3178200ab..a8aac17e1e82 100644 +--- a/arch/s390/appldata/appldata_os.c ++++ b/arch/s390/appldata/appldata_os.c +@@ -24,10 +24,6 @@ + + #include "appldata.h" + +- +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +- + /* + * OS data + * +diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c +index 61b64c2b2cb8..e215a2c10a61 100644 +--- a/drivers/cpuidle/governors/menu.c ++++ b/drivers/cpuidle/governors/menu.c +@@ -132,10 
+132,6 @@ struct menu_device { + int interval_ptr; + }; + +- +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +- + static inline int get_loadavg(unsigned long load) + { + return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; +diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c +index 983fce5c2418..111a25e4b088 100644 +--- a/fs/proc/loadavg.c ++++ b/fs/proc/loadavg.c +@@ -9,9 +9,6 @@ + #include <linux/seqlock.h> + #include <linux/time.h> + +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) +- + static int loadavg_proc_show(struct seq_file *m, void *v) + { + unsigned long avnrun[3]; +diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h +index 4264bc6b2c27..745483bb5cca 100644 +--- a/include/linux/sched/loadavg.h ++++ b/include/linux/sched/loadavg.h +@@ -26,6 +26,9 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + ++#define LOAD_INT(x) ((x) >> FSHIFT) ++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) ++ + extern void calc_global_load(unsigned long ticks); + + #endif /* _LINUX_SCHED_LOADAVG_H */ +diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c +index c8146d53ca67..2dddd25ccd7a 100644 +--- a/kernel/debug/kdb/kdb_main.c ++++ b/kernel/debug/kdb/kdb_main.c +@@ -2571,16 +2571,11 @@ static int kdb_summary(int argc, const char **argv) + } + kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); + +- /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ +- +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", + LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), + LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), + LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); +-#undef LOAD_INT +-#undef LOAD_FRAC ++ + /* Display in kilobytes */ + #define 
K(x) ((x) << (PAGE_SHIFT - 10)) + kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" +-- +2.14.1 diff --git a/N1/3.hdr b/N1/3.hdr new file mode 100644 index 0000000..c0eac16 --- /dev/null +++ b/N1/3.hdr @@ -0,0 +1,2 @@ +Content-Type: text/x-diff; charset=us-ascii +Content-Disposition: attachment; filename="0002-mm-workingset-tell-cache-transitions-from-workingset.patch" diff --git a/N1/3.txt b/N1/3.txt new file mode 100644 index 0000000..8419633 --- /dev/null +++ b/N1/3.txt @@ -0,0 +1,407 @@ +>From 4ccc6444efbdcc30680eff6b8f345511c306f3d7 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner <hannes@cmpxchg.org> +Date: Thu, 2 Mar 2017 09:58:03 -0500 +Subject: [PATCH 2/3] mm: workingset: tell cache transitions from workingset + thrashing + +Refaults happen during transitions between workingsets as well as +in-place thrashing. Knowing the difference between the two has a range +of applications, including measuring the impact of memory shortage on +the system performance, as well as the ability to smarter balance +pressure between the filesystem cache and the swap-backed workingset. + +During workingset transitions, inactive cache refaults and pushes out +established active cache. When that active cache isn't stale, however, +and also ends up refaulting, that's bonafide thrashing. + +Introduce a new page flag that tells on eviction whether the page has +been active or not in its lifetime. This bit is then stored in the +shadow entry, to classify refaults as transitioning or thrashing. 
+ +Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> +--- + include/linux/mmzone.h | 1 + + include/linux/page-flags.h | 5 ++- + include/linux/swap.h | 2 +- + include/trace/events/mmflags.h | 1 + + mm/filemap.c | 9 ++-- + mm/huge_memory.c | 1 + + mm/memcontrol.c | 2 + + mm/migrate.c | 2 + + mm/swap_state.c | 1 + + mm/vmscan.c | 1 + + mm/vmstat.c | 1 + + mm/workingset.c | 96 +++++++++++++++++++++++++++--------------- + 12 files changed, 79 insertions(+), 43 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index fc14b8b3f6ce..b8726b501166 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -156,6 +156,7 @@ enum node_stat_item { + NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + WORKINGSET_REFAULT, + WORKINGSET_ACTIVATE, ++ WORKINGSET_RESTORE, + WORKINGSET_NODERECLAIM, + NR_ANON_MAPPED, /* Mapped anonymous pages */ + NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index d33e3280c8ad..f889af1a6aed 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -73,13 +73,14 @@ + */ + enum pageflags { + PG_locked, /* Page is locked. Don't touch. */ +- PG_error, + PG_referenced, + PG_uptodate, + PG_dirty, + PG_lru, + PG_active, ++ PG_workingset, + PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ ++ PG_error, + PG_slab, + PG_owner_priv_1, /* Owner use. 
If pagecache, fs may use*/ + PG_arch_1, +@@ -272,6 +273,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) + PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) + PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) + TESTCLEARFLAG(Active, active, PF_HEAD) ++PAGEFLAG(Workingset, workingset, PF_HEAD) ++ TESTCLEARFLAG(Workingset, workingset, PF_HEAD) + __PAGEFLAG(Slab, slab, PF_NO_TAIL) + __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) + PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index d83d28e53e62..914a173beee1 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -252,7 +252,7 @@ struct swap_info_struct { + + /* linux/mm/workingset.c */ + void *workingset_eviction(struct address_space *mapping, struct page *page); +-bool workingset_refault(void *shadow); ++void workingset_refault(struct page *page, void *shadow); + void workingset_activation(struct page *page); + void workingset_update_node(struct radix_tree_node *node, void *private); + +diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h +index 8e50d01c645f..aac9eb272754 100644 +--- a/include/trace/events/mmflags.h ++++ b/include/trace/events/mmflags.h +@@ -90,6 +90,7 @@ + {1UL << PG_dirty, "dirty" }, \ + {1UL << PG_lru, "lru" }, \ + {1UL << PG_active, "active" }, \ ++ {1UL << PG_workingset, "workingset" }, \ + {1UL << PG_slab, "slab" }, \ + {1UL << PG_owner_priv_1, "owner_priv_1" }, \ + {1UL << PG_arch_1, "arch_1" }, \ +diff --git a/mm/filemap.c b/mm/filemap.c +index 65b4b6e7f7bd..da55a5693da9 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -823,12 +823,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + * data from the working set, only to cache data that will + * get overwritten with something else, is a waste of memory. 
+ */ +- if (!(gfp_mask & __GFP_WRITE) && +- shadow && workingset_refault(shadow)) { +- SetPageActive(page); +- workingset_activation(page); +- } else +- ClearPageActive(page); ++ WARN_ON_ONCE(PageActive(page)); ++ if (!(gfp_mask & __GFP_WRITE) && shadow) ++ workingset_refault(page, shadow); + lru_cache_add(page); + } + return ret; +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 90731e3b7e58..b18ac8084c2a 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2239,6 +2239,7 @@ static void __split_huge_page_tail(struct page *head, int tail, + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | ++ (1L << PG_workingset) | + (1L << PG_locked) | + (1L << PG_unevictable) | + (1L << PG_dirty))); +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index e09741af816f..93b2eb063afd 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5274,6 +5274,8 @@ static int memory_stat_show(struct seq_file *m, void *v) + stat[WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + stat[WORKINGSET_ACTIVATE]); ++ seq_printf(m, "workingset_restore %lu\n", ++ stat[WORKINGSET_RESTORE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + stat[WORKINGSET_NODERECLAIM]); + +diff --git a/mm/migrate.c b/mm/migrate.c +index e84eeb4e4356..48f4a79869ce 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -624,6 +624,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) + SetPageActive(newpage); + } else if (TestClearPageUnevictable(page)) + SetPageUnevictable(newpage); ++ if (PageWorkingset(page)) ++ SetPageWorkingset(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) +diff --git a/mm/swap_state.c b/mm/swap_state.c +index b68c93014f50..b39b3969be07 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -387,6 +387,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + /* + * Initiate read into locked page and return. 
+ */ ++ SetPageWorkingset(new_page); + lru_cache_add_anon(new_page); + *new_page_allocated = true; + return new_page; +diff --git a/mm/vmscan.c b/mm/vmscan.c +index a1af041930a6..60357cd84c67 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2022,6 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan, + } + + ClearPageActive(page); /* we are de-activating */ ++ SetPageWorkingset(page); + list_add(&page->lru, &l_inactive); + } + +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 9a4441bbeef2..87ce53498828 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -956,6 +956,7 @@ const char * const vmstat_text[] = { + "nr_isolated_file", + "workingset_refault", + "workingset_activate", ++ "workingset_restore", + "workingset_nodereclaim", + "nr_anon_pages", + "nr_mapped", +diff --git a/mm/workingset.c b/mm/workingset.c +index 7119cd745ace..264f0498f2bc 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -120,7 +120,7 @@ + * the only thing eating into inactive list space is active pages. + * + * +- * Activating refaulting pages ++ * Refaulting inactive pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given +@@ -133,6 +133,10 @@ + * used less frequently than the refaulting page - or even not used at + * all anymore. + * ++ * That means if inactive cache is refaulting with a suitable refault ++ * distance, we assume the cache workingset is transitioning and put ++ * pressure on the current active list. ++ * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used once will be evicted from memory. +@@ -140,6 +144,14 @@ + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. 
+ * ++ * Refaulting active pages ++ * ++ * If on the other hand the refaulting pages have recently been ++ * deactivated, it means that the active list is no longer protecting ++ * actively used cache from reclaim. The cache is NOT transitioning to ++ * a different workingset; the existing workingset is thrashing in the ++ * space allocated to the page cache. ++ * + * + * Implementation + * +@@ -155,8 +167,7 @@ + */ + + #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ +- NODES_SHIFT + \ +- MEM_CGROUP_ID_SHIFT) ++ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT) + #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) + + /* +@@ -169,23 +180,28 @@ + */ + static unsigned int bucket_order __read_mostly; + +-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) ++static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, ++ bool workingset) + { + eviction >>= bucket_order; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; ++ eviction = (eviction << 1) | workingset; + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + } + + static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, +- unsigned long *evictionp) ++ unsigned long *evictionp, bool *workingsetp) + { + unsigned long entry = (unsigned long)shadow; + int memcgid, nid; ++ bool workingset; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; ++ workingset = entry & 1; ++ entry >>= 1; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); +@@ -194,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); + *evictionp = entry << bucket_order; ++ *workingsetp = workingset; + } + + /** +@@ -206,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + */ + void 
*workingset_eviction(struct address_space *mapping, struct page *page) + { +- struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); ++ struct mem_cgroup *memcg = page_memcg(page); + int memcgid = mem_cgroup_id(memcg); + unsigned long eviction; + struct lruvec *lruvec; +@@ -219,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) + + lruvec = mem_cgroup_lruvec(pgdat, memcg); + eviction = atomic_long_inc_return(&lruvec->inactive_age); +- return pack_shadow(memcgid, pgdat, eviction); ++ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); + } + + /** + * workingset_refault - evaluate the refault of a previously evicted page ++ * @page: the freshly allocated replacement page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the node it was allocated in. +- * +- * Returns %true if the page should be activated, %false otherwise. + */ +-bool workingset_refault(void *shadow) ++void workingset_refault(struct page *page, void *shadow) + { + unsigned long refault_distance; ++ struct pglist_data *pgdat; + unsigned long active_file; + struct mem_cgroup *memcg; + unsigned long eviction; + struct lruvec *lruvec; + unsigned long refault; +- struct pglist_data *pgdat; ++ bool workingset; + int memcgid; + +- unpack_shadow(shadow, &memcgid, &pgdat, &eviction); ++ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + + rcu_read_lock(); + /* +@@ -262,41 +279,50 @@ bool workingset_refault(void *shadow) + * configurations instead. 
+ */ + memcg = mem_cgroup_from_id(memcgid); +- if (!mem_cgroup_disabled() && !memcg) { +- rcu_read_unlock(); +- return false; +- } ++ if (!mem_cgroup_disabled() && !memcg) ++ goto out; + lruvec = mem_cgroup_lruvec(pgdat, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); + + /* +- * The unsigned subtraction here gives an accurate distance +- * across inactive_age overflows in most cases. ++ * Calculate the refault distance + * +- * There is a special case: usually, shadow entries have a +- * short lifetime and are either refaulted or reclaimed along +- * with the inode before they get too old. But it is not +- * impossible for the inactive_age to lap a shadow entry in +- * the field, which can then can result in a false small +- * refault distance, leading to a false activation should this +- * old entry actually refault again. However, earlier kernels +- * used to deactivate unconditionally with *every* reclaim +- * invocation for the longest time, so the occasional +- * inappropriate activation leading to pressure on the active +- * list is not a problem. ++ * The unsigned subtraction here gives an accurate distance ++ * across inactive_age overflows in most cases. There is a ++ * special case: usually, shadow entries have a short lifetime ++ * and are either refaulted or reclaimed along with the inode ++ * before they get too old. But it is not impossible for the ++ * inactive_age to lap a shadow entry in the field, which can ++ * then result in a false small refault distance, leading ++ * to a false activation should this old entry actually ++ * refault again. However, earlier kernels used to deactivate ++ * unconditionally with *every* reclaim invocation for the ++ * longest time, so the occasional inappropriate activation ++ * leading to pressure on the active list is not a problem.
+ */ + refault_distance = (refault - eviction) & EVICTION_MASK; + + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); + +- if (refault_distance <= active_file) { +- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); +- rcu_read_unlock(); +- return true; +- } ++ /* ++ * Compare the distance to the existing workingset size. We ++ * don't act on pages that couldn't stay resident even if all ++ * the memory was available to the page cache. ++ */ ++ if (refault_distance > active_file) ++ goto out; ++ ++ SetPageActive(page); ++ SetPageWorkingset(page); ++ atomic_long_inc(&lruvec->inactive_age); ++ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); ++ ++ /* Page was active prior to eviction */ ++ if (workingset) ++ inc_lruvec_state(lruvec, WORKINGSET_RESTORE); ++out: + rcu_read_unlock(); +- return false; + } + + /** +-- +2.14.1 diff --git a/N1/4.hdr b/N1/4.hdr new file mode 100644 index 0000000..59b4563 --- /dev/null +++ b/N1/4.hdr @@ -0,0 +1,2 @@ +Content-Type: text/x-diff; charset=us-ascii +Content-Disposition: attachment; filename="0003-mm-sched-memdelay-memory-health-interface-for-system.patch" diff --git a/N1/4.txt b/N1/4.txt new file mode 100644 index 0000000..a700eaf --- /dev/null +++ b/N1/4.txt @@ -0,0 +1,1209 @@ +>From c3e97f5daf99bcd54383eaab466c477dbb743dd9 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner <hannes@cmpxchg.org> +Date: Mon, 5 Jun 2017 16:07:22 -0400 +Subject: [PATCH 3/3] mm/sched: memdelay: memory health interface for systems + and workloads + +Linux doesn't have a useful metric to describe the memory health of a +system, a cgroup container, or individual tasks. + +When workloads are bigger than available memory, they spend a certain +amount of their time inside page reclaim, waiting on thrashing cache, +and swapping in. This has impact on latency, and depending on the CPU +capacity in the system can also translate to a decrease in throughput. 
+ +While Linux exports some stats and counters for these events, it does +not quantify the true impact they have on throughput and latency. How +much of the execution time is spent unproductively? This is important +to know when sizing workloads to systems and containers. It also comes +in handy when evaluating the effectiveness and efficiency of the +kernel's memory management policies and heuristics. + +This patch implements a metric that quantifies memory pressure in a +unit that matters most to applications and does not rely on hardware +aspects to be meaningful: wallclock time lost while waiting on memory. + +Whenever a task is blocked on refaults, swapins, or direct reclaim, +the time it spends is accounted on the task level and aggregated into +a domain state along with other tasks on the system and cgroup level. + +Each task has a /proc/<pid>/memdelay file that lists the microseconds +the task has been delayed since it's been forked. That file can be +sampled periodically for recent delays, or before and after certain +operations to measure their memory-related latencies. + +On the system and cgroup-level, there are /proc/memdelay and +memory.memdelay, respectively, and their format is as such: + +$ cat /proc/memdelay +2489084 +41.61 47.28 29.66 +0.00 0.00 0.00 + +The first line shows the cumulative delay times of all tasks in the +domain - in this case, all tasks in the system cumulatively lost 2.49 +seconds due to memory delays. + +The second and third line show percentages spent in aggregate states +for the domain - system or cgroup - in a load average type format as +decaying averages over the last 1m, 5m, and 15m: + +The second line indicates the share of wall-time the domain spends in +a state where SOME tasks are delayed by memory while others are still +productive (runnable or iowait). 
This indicates a latency problem for +individual tasks, but since the CPU/IO capacity is still used, adding +more memory might not necessarily improve the domain's throughput. + +The third line indicates the share of wall-time the domain spends in a +state where ALL non-idle tasks are delayed by memory. In this state, +the domain is entirely unproductive due to a lack of memory. + +v2: +- fix active-delay condition when only other runnables, no iowait +- drop private lock from sched path, we can use the rq lock +- fix refault vs. simple lockwait detection +- drop ktime, we can use cpu_clock() + +XXX: +- eliminate redundant cgroup hierarchy walks in the scheduler + +Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> +--- + fs/proc/array.c | 8 ++ + fs/proc/base.c | 2 + + fs/proc/internal.h | 2 + + include/linux/memcontrol.h | 14 +++ + include/linux/memdelay.h | 182 +++++++++++++++++++++++++++++ + include/linux/sched.h | 8 ++ + kernel/cgroup/cgroup.c | 3 +- + kernel/fork.c | 4 + + kernel/sched/Makefile | 2 +- + kernel/sched/core.c | 27 +++++ + kernel/sched/memdelay.c | 118 +++++++++++++++++++ + mm/Makefile | 2 +- + mm/compaction.c | 4 + + mm/filemap.c | 11 ++ + mm/memcontrol.c | 25 ++++ + mm/memdelay.c | 285 +++++++++++++++++++++++++++++++++++++++++++++ + mm/page_alloc.c | 11 +- + mm/vmscan.c | 9 ++ + 18 files changed, 712 insertions(+), 5 deletions(-) + create mode 100644 include/linux/memdelay.h + create mode 100644 kernel/sched/memdelay.c + create mode 100644 mm/memdelay.c + +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 88c355574aa0..00e0e9aa3e70 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -611,6 +611,14 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + return 0; + } + ++int proc_pid_memdelay(struct seq_file *m, struct pid_namespace *ns, ++ struct pid *pid, struct task_struct *task) ++{ ++ seq_put_decimal_ull(m, "", task->memdelay_total); ++ seq_putc(m, '\n'); ++ return 0; ++} ++ + #ifdef CONFIG_PROC_CHILDREN + static 
struct pid * + get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 719c2e943ea1..19f194940c80 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -2916,6 +2916,7 @@ static const struct pid_entry tgid_base_stuff[] = { + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), ++ ONE("memdelay", S_IRUGO, proc_pid_memdelay), + REG("maps", S_IRUGO, proc_pid_maps_operations), + #ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), +@@ -3307,6 +3308,7 @@ static const struct pid_entry tid_base_stuff[] = { + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), ++ ONE("memdelay", S_IRUGO, proc_pid_memdelay), + REG("maps", S_IRUGO, proc_tid_maps_operations), + #ifdef CONFIG_PROC_CHILDREN + REG("children", S_IRUGO, proc_tid_children_operations), +diff --git a/fs/proc/internal.h b/fs/proc/internal.h +index aa2b89071630..7ab706c316b8 100644 +--- a/fs/proc/internal.h ++++ b/fs/proc/internal.h +@@ -146,6 +146,8 @@ extern int proc_pid_status(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); + extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, + struct pid *, struct task_struct *); ++extern int proc_pid_memdelay(struct seq_file *, struct pid_namespace *, ++ struct pid *, struct task_struct *); + + /* + * base.c +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 9b15a4bcfa77..1f720d3090f7 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -30,6 +30,7 @@ + #include <linux/vmstat.h> + #include <linux/writeback.h> + #include <linux/page-flags.h> ++#include <linux/memdelay.h> + + struct mem_cgroup; + struct page; +@@ -183,6 +184,9 @@ struct mem_cgroup { + + unsigned long soft_limit; + ++ /* Memory delay measurement domain */ ++ struct 
memdelay_domain *memdelay_domain; ++ + /* vmpressure notifications */ + struct vmpressure vmpressure; + +@@ -728,6 +732,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, + return &pgdat->lruvec; + } + ++static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) ++{ ++ return NULL; ++} ++ + static inline bool mm_match_cgroup(struct mm_struct *mm, + struct mem_cgroup *memcg) + { +@@ -740,6 +749,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, + return true; + } + ++static inline struct mem_cgroup *mem_cgroup_from_task(struct task_struct *task) ++{ ++ return NULL; ++} ++ + static inline struct mem_cgroup * + mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, +diff --git a/include/linux/memdelay.h b/include/linux/memdelay.h +new file mode 100644 +index 000000000000..08ed4e4baedf +--- /dev/null ++++ b/include/linux/memdelay.h +@@ -0,0 +1,182 @@ ++#ifndef _LINUX_MEMDELAY_H ++#define _LINUX_MEMDELAY_H ++ ++#include <linux/spinlock_types.h> ++#include <linux/sched.h> ++ ++struct seq_file; ++struct css_set; ++ ++/* ++ * Task productivity states tracked by the scheduler ++ */ ++enum memdelay_task_state { ++ MTS_NONE, /* Idle/unqueued/untracked */ ++ MTS_IOWAIT, /* Waiting for IO, not memory delayed */ ++ MTS_RUNNABLE, /* On the runqueue, not memory delayed */ ++ MTS_DELAYED, /* Memory delayed, not running */ ++ MTS_DELAYED_ACTIVE, /* Memory delayed, actively running */ ++ NR_MEMDELAY_TASK_STATES, ++}; ++ ++/* ++ * System/cgroup delay state tracked by the VM, composed of the ++ * productivity states of all tasks inside the domain. 
++ */ ++enum memdelay_domain_state { ++ MDS_NONE, /* No delayed tasks */ ++ MDS_SOME, /* Delayed tasks, working tasks */ ++ MDS_FULL, /* Delayed tasks, no working tasks */ ++ NR_MEMDELAY_DOMAIN_STATES, ++}; ++ ++struct memdelay_domain_cpu { ++ /* Task states of the domain on this CPU */ ++ int tasks[NR_MEMDELAY_TASK_STATES]; ++ ++ /* Delay state of the domain on this CPU */ ++ enum memdelay_domain_state state; ++ ++ /* Time of last state change */ ++ u64 state_start; ++}; ++ ++struct memdelay_domain { ++ /* Aggregate delayed time of all domain tasks */ ++ unsigned long aggregate; ++ ++ /* Per-CPU delay states in the domain */ ++ struct memdelay_domain_cpu __percpu *mdcs; ++ ++ /* Cumulative state times from all CPUs */ ++ unsigned long times[NR_MEMDELAY_DOMAIN_STATES]; ++ ++ /* Decaying state time averages over 1m, 5m, 15m */ ++ unsigned long period_expires; ++ unsigned long avg_full[3]; ++ unsigned long avg_some[3]; ++}; ++ ++/* mm/memdelay.c */ ++extern struct memdelay_domain memdelay_global_domain; ++void memdelay_init(void); ++void memdelay_task_change(struct task_struct *task, ++ enum memdelay_task_state old, ++ enum memdelay_task_state new); ++struct memdelay_domain *memdelay_domain_alloc(void); ++void memdelay_domain_free(struct memdelay_domain *md); ++int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md); ++ ++/* kernel/sched/memdelay.c */ ++void memdelay_enter(unsigned long *flags); ++void memdelay_leave(unsigned long *flags); ++ ++/** ++ * memdelay_schedule - note a context switch ++ * @prev: task scheduling out ++ * @next: task scheduling in ++ * ++ * A task switch doesn't affect the balance between delayed and ++ * productive tasks, but we have to update whether the delay is ++ * actively using the CPU or not. 
++ */ ++static inline void memdelay_schedule(struct task_struct *prev, ++ struct task_struct *next) ++{ ++ if (prev->flags & PF_MEMDELAY) ++ memdelay_task_change(prev, MTS_DELAYED_ACTIVE, MTS_DELAYED); ++ ++ if (next->flags & PF_MEMDELAY) ++ memdelay_task_change(next, MTS_DELAYED, MTS_DELAYED_ACTIVE); ++} ++ ++/** ++ * memdelay_wakeup - note a task waking up ++ * @task: the task ++ * ++ * Notes an idle task becoming productive. Delayed tasks remain ++ * delayed even when they become runnable. ++ */ ++static inline void memdelay_wakeup(struct task_struct *task) ++{ ++ if (task->flags & PF_MEMDELAY) ++ return; ++ ++ if (task->in_iowait) ++ memdelay_task_change(task, MTS_IOWAIT, MTS_RUNNABLE); ++ else ++ memdelay_task_change(task, MTS_NONE, MTS_RUNNABLE); ++} ++ ++/** ++ * memdelay_sleep - note a task going to sleep ++ * @task: the task ++ * ++ * Notes a working task becoming unproductive. Delayed tasks remain ++ * delayed. ++ */ ++static inline void memdelay_sleep(struct task_struct *task) ++{ ++ if (task->flags & PF_MEMDELAY) ++ return; ++ ++ if (task->in_iowait) ++ memdelay_task_change(task, MTS_RUNNABLE, MTS_IOWAIT); ++ else ++ memdelay_task_change(task, MTS_RUNNABLE, MTS_NONE); ++} ++ ++/** ++ * memdelay_del_add - track task movement between runqueues ++ * @task: the task ++ * @runnable: a runnable task is moved if %true, unqueued otherwise ++ * @add: task is being added if %true, removed otherwise ++ * ++ * Update the memdelay domain per-cpu states as tasks are being moved ++ * around the runqueues.
++ */ ++static inline void memdelay_del_add(struct task_struct *task, ++ bool runnable, bool add) ++{ ++ int state; ++ ++ if (task->flags & PF_MEMDELAY) ++ state = MTS_DELAYED; ++ else if (runnable) ++ state = MTS_RUNNABLE; ++ else if (task->in_iowait) ++ state = MTS_IOWAIT; ++ else ++ return; /* already MTS_NONE */ ++ ++ if (add) ++ memdelay_task_change(task, MTS_NONE, state); ++ else ++ memdelay_task_change(task, state, MTS_NONE); ++} ++ ++static inline void memdelay_del_runnable(struct task_struct *task) ++{ ++ memdelay_del_add(task, true, false); ++} ++ ++static inline void memdelay_add_runnable(struct task_struct *task) ++{ ++ memdelay_del_add(task, true, true); ++} ++ ++static inline void memdelay_del_sleeping(struct task_struct *task) ++{ ++ memdelay_del_add(task, false, false); ++} ++ ++static inline void memdelay_add_sleeping(struct task_struct *task) ++{ ++ memdelay_del_add(task, false, true); ++} ++ ++#ifdef CONFIG_CGROUPS ++void cgroup_move_task(struct task_struct *task, struct css_set *to); ++#endif ++ ++#endif /* _LINUX_MEMDELAY_H */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c05ac5f5aa03..de15e3c8c43a 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -651,6 +651,7 @@ struct task_struct { + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; + #endif ++ unsigned memdelay_migrate_enqueue:1; + + unsigned long atomic_flags; /* Flags requiring atomic access. 
*/ + +@@ -871,6 +872,12 @@ struct task_struct { + + struct io_context *io_context; + ++ u64 memdelay_start; ++ unsigned long memdelay_total; ++#ifdef CONFIG_DEBUG_VM ++ int memdelay_state; ++#endif ++ + /* Ptrace state: */ + unsigned long ptrace_message; + siginfo_t *last_siginfo; +@@ -1274,6 +1281,7 @@ extern struct pid *cad_pid; + #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ + #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ ++#define PF_MEMDELAY 0x01000000 /* Delayed due to lack of memory */ + #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ + #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ +diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c +index df2e0f14a95d..930aaef50396 100644 +--- a/kernel/cgroup/cgroup.c ++++ b/kernel/cgroup/cgroup.c +@@ -699,7 +699,8 @@ static void css_set_move_task(struct task_struct *task, + */ + WARN_ON_ONCE(task->flags & PF_EXITING); + +- rcu_assign_pointer(task->cgroups, to_cset); ++ cgroup_move_task(task, to_cset); ++ + list_add_tail(&task->cg_list, use_mg_tasks ? 
&to_cset->mg_tasks : + &to_cset->tasks); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index b7e9e57b71ea..96dd35393be9 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1208,6 +1208,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) + int retval; + + tsk->min_flt = tsk->maj_flt = 0; ++ tsk->memdelay_total = 0; ++#ifdef CONFIG_DEBUG_VM ++ tsk->memdelay_state = 0; ++#endif + tsk->nvcsw = tsk->nivcsw = 0; + #ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 53f0164ed362..84390fc42f60 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -17,7 +17,7 @@ endif + + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle_task.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o idle.o memdelay.o + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHEDSTATS) += stats.o +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 0869b20fba81..bf105c870da6 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -26,6 +26,7 @@ + #include <linux/profile.h> + #include <linux/security.h> + #include <linux/syscalls.h> ++#include <linux/memdelay.h> + + #include <asm/switch_to.h> + #include <asm/tlb.h> +@@ -759,6 +760,14 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); + ++ WARN_ON_ONCE(!(flags & ENQUEUE_WAKEUP) && p->memdelay_migrate_enqueue); ++ if (!(flags & ENQUEUE_WAKEUP) || p->memdelay_migrate_enqueue) { ++ memdelay_add_runnable(p); ++ p->memdelay_migrate_enqueue = 0; ++ } else { ++ memdelay_wakeup(p); ++ } ++ + p->sched_class->enqueue_task(rq, p, flags); + } + +@@ -770,6 +779,11 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) + 
if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); + ++ if (!(flags & DEQUEUE_SLEEP)) ++ memdelay_del_runnable(p); ++ else ++ memdelay_sleep(p); ++ + p->sched_class->dequeue_task(rq, p, flags); + } + +@@ -2044,7 +2058,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { ++ struct rq_flags rf; ++ struct rq *rq; ++ + wake_flags |= WF_MIGRATED; ++ ++ rq = __task_rq_lock(p, &rf); ++ memdelay_del_sleeping(p); ++ __task_rq_unlock(rq, &rf); ++ p->memdelay_migrate_enqueue = 1; ++ + set_task_cpu(p, cpu); + } + +@@ -3326,6 +3349,8 @@ static void __sched notrace __schedule(bool preempt) + rq->curr = next; + ++*switch_count; + ++ memdelay_schedule(prev, next); ++ + trace_sched_switch(preempt, prev, next); + + /* Also unlocks the rq: */ +@@ -5919,6 +5944,8 @@ void __init sched_init(void) + + init_schedstats(); + ++ memdelay_init(); ++ + scheduler_running = 1; + } + +diff --git a/kernel/sched/memdelay.c b/kernel/sched/memdelay.c +new file mode 100644 +index 000000000000..1d4813cd018a +--- /dev/null ++++ b/kernel/sched/memdelay.c +@@ -0,0 +1,118 @@ ++/* ++ * Memory delay metric ++ * ++ * Copyright (c) 2017 Facebook, Johannes Weiner ++ * ++ * This code quantifies and reports to userspace the wall-time impact ++ * of memory pressure on the system and memory-controlled cgroups. ++ */ ++ ++#include <linux/memdelay.h> ++#include <linux/cgroup.h> ++#include <linux/sched.h> ++ ++#include "sched.h" ++ ++/** ++ * memdelay_enter - mark the beginning of a memory delay section ++ * @flags: flags to handle nested memdelay sections ++ * ++ * Marks the calling task as being delayed due to a lack of memory, ++ * such as waiting for a workingset refault or performing reclaim. 
++ */ ++void memdelay_enter(unsigned long *flags) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ *flags = current->flags & PF_MEMDELAY; ++ if (*flags) ++ return; ++ /* ++ * PF_MEMDELAY & accounting needs to be atomic wrt changes to ++ * the task's scheduling state and its domain association. ++ * Otherwise we could race with CPU or cgroup migration and ++ * misaccount. ++ */ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq, &rf); ++ ++ current->flags |= PF_MEMDELAY; ++ memdelay_task_change(current, MTS_RUNNABLE, MTS_DELAYED_ACTIVE); ++ ++ rq_unlock(rq, &rf); ++ local_irq_enable(); ++} ++ ++/** ++ * memdelay_leave - mark the end of a memory delay section ++ * @flags: flags to handle nested memdelay sections ++ * ++ * Marks the calling task as no longer delayed due to memory. ++ */ ++void memdelay_leave(unsigned long *flags) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (*flags) ++ return; ++ /* ++ * PF_MEMDELAY & accounting needs to be atomic wrt changes to ++ * the task's scheduling state and its domain association. ++ * Otherwise we could race with CPU or cgroup migration and ++ * misaccount. ++ */ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq, &rf); ++ ++ current->flags &= ~PF_MEMDELAY; ++ memdelay_task_change(current, MTS_DELAYED_ACTIVE, MTS_RUNNABLE); ++ ++ rq_unlock(rq, &rf); ++ local_irq_enable(); ++} ++ ++#ifdef CONFIG_CGROUPS ++/** ++ * cgroup_move_task - move task to a different cgroup ++ * @task: the task ++ * @to: the target css_set ++ * ++ * Move task to a new cgroup and safely migrate its associated ++ * delayed/working state between the different domains. ++ * ++ * This function acquires the task's rq lock to lock out concurrent ++ * changes to the task's scheduling state and - in case the task is ++ * running - concurrent changes to its delay state. 
++ */ ++void cgroup_move_task(struct task_struct *task, struct css_set *to) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ int state; ++ ++ rq = task_rq_lock(task, &rf); ++ ++ if (task->flags & PF_MEMDELAY) ++ state = MTS_DELAYED + task_current(rq, task); ++ else if (task_on_rq_queued(task)) ++ state = MTS_RUNNABLE; ++ else if (task->in_iowait) ++ state = MTS_IOWAIT; ++ else ++ state = MTS_NONE; ++ ++ /* ++ * Lame to do this here, but the scheduler cannot be locked ++ * from the outside, so we move cgroups from inside sched/. ++ */ ++ memdelay_task_change(task, state, MTS_NONE); ++ rcu_assign_pointer(task->cgroups, to); ++ memdelay_task_change(task, MTS_NONE, state); ++ ++ task_rq_unlock(rq, task, &rf); ++} ++#endif /* CONFIG_CGROUPS */ +diff --git a/mm/Makefile b/mm/Makefile +index 411bd24d4a7c..c9bdbc5627e5 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ + mm_init.o mmu_context.o percpu.o slab_common.o \ + compaction.o vmacache.o swap_slots.o \ + interval_tree.o list_lru.o workingset.o \ +- debug.o $(mmu-y) ++ memdelay.o debug.o $(mmu-y) + + obj-y += init-mm.o + +diff --git a/mm/compaction.c b/mm/compaction.c +index fb548e4c7bd4..adf67de23fee 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -2040,11 +2040,15 @@ static int kcompactd(void *p) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { ++ unsigned long mdflags; ++ + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + ++ memdelay_enter(&mdflags); + kcompactd_do_work(pgdat); ++ memdelay_leave(&mdflags); + } + + return 0; +diff --git a/mm/filemap.c b/mm/filemap.c +index da55a5693da9..648418694405 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -36,6 +36,7 @@ + #include <linux/memcontrol.h> + #include <linux/cleancache.h> + #include <linux/rmap.h> ++#include <linux/memdelay.h> + #include "internal.h" + + #define 
CREATE_TRACE_POINTS +@@ -961,8 +962,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, + { + struct wait_page_queue wait_page; + wait_queue_entry_t *wait = &wait_page.wait; ++ unsigned long mdflags; ++ bool refault = false; + int ret = 0; + ++ if (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) { ++ memdelay_enter(&mdflags); ++ refault = true; ++ } ++ + init_wait(wait); + wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; + wait->func = wake_page_function; +@@ -1001,6 +1009,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, + + finish_wait(q, wait); + ++ if (refault) ++ memdelay_leave(&mdflags); ++ + /* + * A signal could leave PageWaiters set. Clearing it here if + * !waitqueue_active would be possible (by open-coding finish_wait), +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 93b2eb063afd..102f0f4d3f5c 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -65,6 +65,7 @@ + #include <linux/lockdep.h> + #include <linux/file.h> + #include <linux/tracehook.h> ++#include <linux/memdelay.h> + #include "internal.h" + #include <net/sock.h> + #include <net/ip.h> +@@ -3926,6 +3927,8 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + return ret; + } + ++static int memory_memdelay_show(struct seq_file *m, void *v); ++ + static struct cftype mem_cgroup_legacy_files[] = { + { + .name = "usage_in_bytes", +@@ -3993,6 +3996,10 @@ static struct cftype mem_cgroup_legacy_files[] = { + { + .name = "pressure_level", + }, ++ { ++ .name = "memdelay", ++ .seq_show = memory_memdelay_show, ++ }, + #ifdef CONFIG_NUMA + { + .name = "numa_stat", +@@ -4170,6 +4177,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) + + for_each_node(node) + free_mem_cgroup_per_node_info(memcg, node); ++ memdelay_domain_free(memcg->memdelay_domain); + free_percpu(memcg->stat); + kfree(memcg); + } +@@ -4275,10 +4283,15 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) + + /* The following stuff does not 
apply to the root */ + if (!parent) { ++ memcg->memdelay_domain = &memdelay_global_domain; + root_mem_cgroup = memcg; + return &memcg->css; + } + ++ memcg->memdelay_domain = memdelay_domain_alloc(); ++ if (!memcg->memdelay_domain) ++ goto fail; ++ + error = memcg_online_kmem(memcg); + if (error) + goto fail; +@@ -5282,6 +5295,13 @@ static int memory_stat_show(struct seq_file *m, void *v) + return 0; + } + ++static int memory_memdelay_show(struct seq_file *m, void *v) ++{ ++ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); ++ ++ return memdelay_domain_show(m, memcg->memdelay_domain); ++} ++ + static struct cftype memory_files[] = { + { + .name = "current", +@@ -5317,6 +5337,11 @@ static struct cftype memory_files[] = { + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_stat_show, + }, ++ { ++ .name = "memdelay", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = memory_memdelay_show, ++ }, + { } /* terminate */ + }; + +diff --git a/mm/memdelay.c b/mm/memdelay.c +new file mode 100644 +index 000000000000..c43d6f7ba22a +--- /dev/null ++++ b/mm/memdelay.c +@@ -0,0 +1,285 @@ ++/* ++ * Memory delay metric ++ * ++ * Copyright (c) 2017 Facebook, Johannes Weiner ++ * ++ * This code quantifies and reports to userspace the wall-time impact ++ * of memory pressure on the system and memory-controlled cgroups. 
++ */ ++ ++#include <linux/sched/loadavg.h> ++#include <linux/sched/clock.h> ++#include <linux/memcontrol.h> ++#include <linux/memdelay.h> ++#include <linux/seq_file.h> ++#include <linux/proc_fs.h> ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/slab.h> ++#include <linux/fs.h> ++ ++static DEFINE_PER_CPU(struct memdelay_domain_cpu, global_domain_cpus); ++ ++/* System-level keeping of memory delay statistics */ ++struct memdelay_domain memdelay_global_domain = { ++ .mdcs = &global_domain_cpus, ++}; ++ ++static void domain_init(struct memdelay_domain *md) ++{ ++ md->period_expires = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * memdelay_init - initialize the memdelay subsystem ++ * ++ * This needs to run before the scheduler starts queuing and ++ * scheduling tasks. ++ */ ++void __init memdelay_init(void) ++{ ++ domain_init(&memdelay_global_domain); ++} ++ ++static void domain_move_clock(struct memdelay_domain *md) ++{ ++ unsigned long expires = READ_ONCE(md->period_expires); ++ unsigned long none, some, full; ++ int missed_periods; ++ unsigned long next; ++ int i; ++ ++ if (time_before(jiffies, expires)) ++ return; ++ ++ missed_periods = 1 + (jiffies - expires) / LOAD_FREQ; ++ next = expires + (missed_periods * LOAD_FREQ); ++ ++ if (cmpxchg(&md->period_expires, expires, next) != expires) ++ return; ++ ++ none = xchg(&md->times[MDS_NONE], 0); ++ some = xchg(&md->times[MDS_SOME], 0); ++ full = xchg(&md->times[MDS_FULL], 0); ++ ++ for (i = 0; i < missed_periods; i++) { ++ unsigned long pct; ++ ++ pct = some * 100 / max(none + some + full, 1UL); ++ pct *= FIXED_1; ++ CALC_LOAD(md->avg_some[0], EXP_1, pct); ++ CALC_LOAD(md->avg_some[1], EXP_5, pct); ++ CALC_LOAD(md->avg_some[2], EXP_15, pct); ++ ++ pct = full * 100 / max(none + some + full, 1UL); ++ pct *= FIXED_1; ++ CALC_LOAD(md->avg_full[0], EXP_1, pct); ++ CALC_LOAD(md->avg_full[1], EXP_5, pct); ++ CALC_LOAD(md->avg_full[2], EXP_15, pct); ++ ++ none = some = full = 0; ++ } ++} ++ ++static void 
domain_cpu_update(struct memdelay_domain *md, int cpu, ++ enum memdelay_task_state old, ++ enum memdelay_task_state new) ++{ ++ enum memdelay_domain_state state; ++ struct memdelay_domain_cpu *mdc; ++ unsigned long delta; ++ u64 now; ++ ++ mdc = per_cpu_ptr(md->mdcs, cpu); ++ ++ if (old) { ++ WARN_ONCE(!mdc->tasks[old], "cpu=%d old=%d new=%d counter=%d\n", ++ cpu, old, new, mdc->tasks[old]); ++ mdc->tasks[old] -= 1; ++ } ++ if (new) ++ mdc->tasks[new] += 1; ++ ++ /* ++ * The domain is somewhat delayed when a number of tasks are ++ * delayed but there are still others running the workload. ++ * ++ * The domain is fully delayed when all non-idle tasks on the ++ * CPU are delayed, or when a delayed task is actively running ++ * and preventing productive tasks from making headway. ++ * ++ * The state times then add up over all CPUs in the domain: if ++ * the domain is fully blocked on one CPU and there is another ++ * one running the workload, the domain is considered fully ++ * blocked 50% of the time. ++ */ ++ if (mdc->tasks[MTS_DELAYED_ACTIVE] && !mdc->tasks[MTS_IOWAIT]) ++ state = MDS_FULL; ++ else if (mdc->tasks[MTS_DELAYED]) ++ state = (mdc->tasks[MTS_RUNNABLE] || mdc->tasks[MTS_IOWAIT]) ? ++ MDS_SOME : MDS_FULL; ++ else ++ state = MDS_NONE; ++ ++ if (mdc->state == state) ++ return; ++ ++ now = cpu_clock(cpu); ++ delta = (now - mdc->state_start) / NSEC_PER_USEC; ++ ++ domain_move_clock(md); ++ md->times[mdc->state] += delta; ++ ++ mdc->state = state; ++ mdc->state_start = now; ++} ++ ++static struct memdelay_domain *memcg_domain(struct mem_cgroup *memcg) ++{ ++#ifdef CONFIG_MEMCG ++ if (!mem_cgroup_disabled()) ++ return memcg->memdelay_domain; ++#endif ++ return &memdelay_global_domain; ++} ++ ++/** ++ * memdelay_task_change - note a task changing its delay/work state ++ * @task: the task changing state ++ * @old: old task state ++ * @new: new task state ++ * ++ * Updates the task's domain counters to reflect a change in the ++ * task's delayed/working state. 
++ */ ++void memdelay_task_change(struct task_struct *task, ++ enum memdelay_task_state old, ++ enum memdelay_task_state new) ++{ ++ int cpu = task_cpu(task); ++ struct mem_cgroup *memcg; ++ unsigned long delay = 0; ++ ++#ifdef CONFIG_DEBUG_VM ++ WARN_ONCE(task->memdelay_state != old, ++ "cpu=%d task=%p state=%d (in_iowait=%d PF_MEMDELAYED=%d) old=%d new=%d\n", ++ cpu, task, task->memdelay_state, task->in_iowait, ++ !!(task->flags & PF_MEMDELAY), old, new); ++ task->memdelay_state = new; ++#endif ++ ++ /* Account when tasks are entering and leaving delays */ ++ if (old < MTS_DELAYED && new >= MTS_DELAYED) { ++ task->memdelay_start = cpu_clock(cpu); ++ } else if (old >= MTS_DELAYED && new < MTS_DELAYED) { ++ delay = (cpu_clock(cpu) - task->memdelay_start) / NSEC_PER_USEC; ++ task->memdelay_total += delay; ++ } ++ ++ /* Account domain state changes */ ++ rcu_read_lock(); ++ memcg = mem_cgroup_from_task(task); ++ do { ++ struct memdelay_domain *md; ++ ++ md = memcg_domain(memcg); ++ md->aggregate += delay; ++ domain_cpu_update(md, cpu, old, new); ++ } while (memcg && (memcg = parent_mem_cgroup(memcg))); ++ rcu_read_unlock(); ++}; ++ ++/** ++ * memdelay_domain_alloc - allocate a cgroup memory delay domain ++ */ ++struct memdelay_domain *memdelay_domain_alloc(void) ++{ ++ struct memdelay_domain *md; ++ ++ md = kzalloc(sizeof(*md), GFP_KERNEL); ++ if (!md) ++ return NULL; ++ md->mdcs = alloc_percpu(struct memdelay_domain_cpu); ++ if (!md->mdcs) { ++ kfree(md); ++ return NULL; ++ } ++ domain_init(md); ++ return md; ++} ++ ++/** ++ * memdelay_domain_free - free a cgroup memory delay domain ++ */ ++void memdelay_domain_free(struct memdelay_domain *md) ++{ ++ if (md) { ++ free_percpu(md->mdcs); ++ kfree(md); ++ } ++} ++ ++/** ++ * memdelay_domain_show - format memory delay domain stats to a seq_file ++ * @s: the seq_file ++ * @md: the memory domain ++ */ ++int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md) ++{ ++ domain_move_clock(md); ++ ++ 
seq_printf(s, "%lu\n", md->aggregate); ++ ++ seq_printf(s, "%lu.%02lu %lu.%02lu %lu.%02lu\n", ++ LOAD_INT(md->avg_some[0]), LOAD_FRAC(md->avg_some[0]), ++ LOAD_INT(md->avg_some[1]), LOAD_FRAC(md->avg_some[1]), ++ LOAD_INT(md->avg_some[2]), LOAD_FRAC(md->avg_some[2])); ++ ++ seq_printf(s, "%lu.%02lu %lu.%02lu %lu.%02lu\n", ++ LOAD_INT(md->avg_full[0]), LOAD_FRAC(md->avg_full[0]), ++ LOAD_INT(md->avg_full[1]), LOAD_FRAC(md->avg_full[1]), ++ LOAD_INT(md->avg_full[2]), LOAD_FRAC(md->avg_full[2])); ++ ++#ifdef CONFIG_DEBUG_VM ++ { ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct memdelay_domain_cpu *mdc; ++ ++ mdc = per_cpu_ptr(md->mdcs, cpu); ++ seq_printf(s, "%d %d %d %d\n", ++ mdc->tasks[MTS_IOWAIT], ++ mdc->tasks[MTS_RUNNABLE], ++ mdc->tasks[MTS_DELAYED], ++ mdc->tasks[MTS_DELAYED_ACTIVE]); ++ } ++ } ++#endif ++ ++ return 0; ++} ++ ++static int memdelay_show(struct seq_file *m, void *v) ++{ ++ return memdelay_domain_show(m, &memdelay_global_domain); ++} ++ ++static int memdelay_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, memdelay_show, NULL); ++} ++ ++static const struct file_operations memdelay_fops = { ++ .open = memdelay_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static int __init memdelay_proc_init(void) ++{ ++ proc_create("memdelay", 0, NULL, &memdelay_fops); ++ return 0; ++} ++module_init(memdelay_proc_init); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 1423da8dd16f..d8d01e9df982 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -67,6 +67,7 @@ + #include <linux/memcontrol.h> + #include <linux/ftrace.h> + #include <linux/nmi.h> ++#include <linux/memdelay.h> + + #include <asm/sections.h> + #include <asm/tlbflush.h> +@@ -3364,16 +3365,19 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, const struct alloc_context *ac, + enum compact_priority prio, enum compact_result *compact_result) + { +- struct page *page; + 
unsigned int noreclaim_flag; ++ unsigned long mdflags; ++ struct page *page; + + if (!order) + return NULL; + ++ memdelay_enter(&mdflags); + noreclaim_flag = memalloc_noreclaim_save(); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + prio); + memalloc_noreclaim_restore(noreclaim_flag); ++ memdelay_leave(&mdflags); + + if (*compact_result <= COMPACT_INACTIVE) + return NULL; +@@ -3519,13 +3523,15 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) + { + struct reclaim_state reclaim_state; +- int progress; + unsigned int noreclaim_flag; ++ unsigned long mdflags; ++ int progress; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); ++ memdelay_enter(&mdflags); + noreclaim_flag = memalloc_noreclaim_save(); + lockdep_set_current_reclaim_state(gfp_mask); + reclaim_state.reclaimed_slab = 0; +@@ -3537,6 +3543,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, + current->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + memalloc_noreclaim_restore(noreclaim_flag); ++ memdelay_leave(&mdflags); + + cond_resched(); + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 60357cd84c67..1029305b9b3a 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -48,6 +48,7 @@ + #include <linux/prefetch.h> + #include <linux/printk.h> + #include <linux/dax.h> ++#include <linux/memdelay.h> + + #include <asm/tlbflush.h> + #include <asm/div64.h> +@@ -3098,6 +3099,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + { + struct zonelist *zonelist; + unsigned long nr_reclaimed; ++ unsigned long mdflags; + int nid; + unsigned int noreclaim_flag; + struct scan_control sc = { +@@ -3126,9 +3128,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + sc.gfp_mask, + sc.reclaim_idx); + ++ memdelay_enter(&mdflags); + noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + 
memalloc_noreclaim_restore(noreclaim_flag); ++ memdelay_leave(&mdflags); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + +@@ -3550,6 +3554,7 @@ static int kswapd(void *p) + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; + for ( ; ; ) { ++ unsigned long mdflags; + bool ret; + + alloc_order = reclaim_order = pgdat->kswapd_order; +@@ -3586,7 +3591,11 @@ static int kswapd(void *p) + */ + trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + alloc_order); ++ ++ memdelay_enter(&mdflags); + reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); ++ memdelay_leave(&mdflags); ++ + if (reclaim_order < alloc_order) + goto kswapd_try_sleep; + } +-- +2.14.1 diff --git a/a/content_digest b/N1/content_digest index da1f80c..3e678d4 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -45,5 +45,1741 @@ "\01:2\0" "fn\00001-sched-loadavg-consolidate-LOAD_INT-LOAD_FRAC-macros.patch\0" "b\0" + ">From d5ffeb4d9d65fcff1b7e50dbde8264b4c32824a5 Mon Sep 17 00:00:00 2001\n" + "From: Johannes Weiner <hannes@cmpxchg.org>\n" + "Date: Wed, 14 Jun 2017 11:12:05 -0400\n" + "Subject: [PATCH 1/3] sched/loadavg: consolidate LOAD_INT, LOAD_FRAC macros\n" + "\n" + "There are several identical definitions of those macros in places that\n" + "mess with fixed-point load averages. 
Provide an official version.\n" + "\n" + "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n" + "---\n" + " arch/powerpc/platforms/cell/spufs/sched.c | 3 ---\n" + " arch/s390/appldata/appldata_os.c | 4 ----\n" + " drivers/cpuidle/governors/menu.c | 4 ----\n" + " fs/proc/loadavg.c | 3 ---\n" + " include/linux/sched/loadavg.h | 3 +++\n" + " kernel/debug/kdb/kdb_main.c | 7 +------\n" + " 6 files changed, 4 insertions(+), 20 deletions(-)\n" + "\n" + "diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c\n" + "index 1fbb5da17dd2..de544070def3 100644\n" + "--- a/arch/powerpc/platforms/cell/spufs/sched.c\n" + "+++ b/arch/powerpc/platforms/cell/spufs/sched.c\n" + "@@ -1071,9 +1071,6 @@ void spuctx_switch_state(struct spu_context *ctx,\n" + " \t}\n" + " }\n" + " \n" + "-#define LOAD_INT(x) ((x) >> FSHIFT)\n" + "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n" + "-\n" + " static int show_spu_loadavg(struct seq_file *s, void *private)\n" + " {\n" + " \tint a, b, c;\n" + "diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c\n" + "index 45b3178200ab..a8aac17e1e82 100644\n" + "--- a/arch/s390/appldata/appldata_os.c\n" + "+++ b/arch/s390/appldata/appldata_os.c\n" + "@@ -24,10 +24,6 @@\n" + " \n" + " #include \"appldata.h\"\n" + " \n" + "-\n" + "-#define LOAD_INT(x) ((x) >> FSHIFT)\n" + "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n" + "-\n" + " /*\n" + " * OS data\n" + " *\n" + "diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c\n" + "index 61b64c2b2cb8..e215a2c10a61 100644\n" + "--- a/drivers/cpuidle/governors/menu.c\n" + "+++ b/drivers/cpuidle/governors/menu.c\n" + "@@ -132,10 +132,6 @@ struct menu_device {\n" + " \tint\t\tinterval_ptr;\n" + " };\n" + " \n" + "-\n" + "-#define LOAD_INT(x) ((x) >> FSHIFT)\n" + "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n" + "-\n" + " static inline int get_loadavg(unsigned long load)\n" + " {\n" + " 
\treturn LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;\n" + "diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c\n" + "index 983fce5c2418..111a25e4b088 100644\n" + "--- a/fs/proc/loadavg.c\n" + "+++ b/fs/proc/loadavg.c\n" + "@@ -9,9 +9,6 @@\n" + " #include <linux/seqlock.h>\n" + " #include <linux/time.h>\n" + " \n" + "-#define LOAD_INT(x) ((x) >> FSHIFT)\n" + "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n" + "-\n" + " static int loadavg_proc_show(struct seq_file *m, void *v)\n" + " {\n" + " \tunsigned long avnrun[3];\n" + "diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h\n" + "index 4264bc6b2c27..745483bb5cca 100644\n" + "--- a/include/linux/sched/loadavg.h\n" + "+++ b/include/linux/sched/loadavg.h\n" + "@@ -26,6 +26,9 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);\n" + " \tload += n*(FIXED_1-exp); \\\n" + " \tload >>= FSHIFT;\n" + " \n" + "+#define LOAD_INT(x) ((x) >> FSHIFT)\n" + "+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n" + "+\n" + " extern void calc_global_load(unsigned long ticks);\n" + " \n" + " #endif /* _LINUX_SCHED_LOADAVG_H */\n" + "diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c\n" + "index c8146d53ca67..2dddd25ccd7a 100644\n" + "--- a/kernel/debug/kdb/kdb_main.c\n" + "+++ b/kernel/debug/kdb/kdb_main.c\n" + "@@ -2571,16 +2571,11 @@ static int kdb_summary(int argc, const char **argv)\n" + " \t}\n" + " \tkdb_printf(\"%02ld:%02ld\\n\", val.uptime/(60*60), (val.uptime/60)%60);\n" + " \n" + "-\t/* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */\n" + "-\n" + "-#define LOAD_INT(x) ((x) >> FSHIFT)\n" + "-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)\n" + " \tkdb_printf(\"load avg %ld.%02ld %ld.%02ld %ld.%02ld\\n\",\n" + " \t\tLOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),\n" + " \t\tLOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),\n" + " \t\tLOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));\n" + "-#undef LOAD_INT\n" + "-#undef 
LOAD_FRAC\n" + "+\n" + " \t/* Display in kilobytes */\n" + " #define K(x) ((x) << (PAGE_SHIFT - 10))\n" + " \tkdb_printf(\"\\nMemTotal: %8lu kB\\nMemFree: %8lu kB\\n\"\n" + "-- \n" + 2.14.1 + "\01:3\0" + "fn\00002-mm-workingset-tell-cache-transitions-from-workingset.patch\0" + "b\0" + ">From 4ccc6444efbdcc30680eff6b8f345511c306f3d7 Mon Sep 17 00:00:00 2001\n" + "From: Johannes Weiner <hannes@cmpxchg.org>\n" + "Date: Thu, 2 Mar 2017 09:58:03 -0500\n" + "Subject: [PATCH 2/3] mm: workingset: tell cache transitions from workingset\n" + " thrashing\n" + "\n" + "Refaults happen during transitions between workingsets as well as\n" + "in-place thrashing. Knowing the difference between the two has a range\n" + "of applications, including measuring the impact of memory shortage on\n" + "the system performance, as well as the ability to smarter balance\n" + "pressure between the filesystem cache and the swap-backed workingset.\n" + "\n" + "During workingset transitions, inactive cache refaults and pushes out\n" + "established active cache. When that active cache isn't stale, however,\n" + "and also ends up refaulting, that's bonafide thrashing.\n" + "\n" + "Introduce a new page flag that tells on eviction whether the page has\n" + "been active or not in its lifetime. 
This bit is then stored in the\n" + "shadow entry, to classify refaults as transitioning or thrashing.\n" + "\n" + "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n" + "---\n" + " include/linux/mmzone.h | 1 +\n" + " include/linux/page-flags.h | 5 ++-\n" + " include/linux/swap.h | 2 +-\n" + " include/trace/events/mmflags.h | 1 +\n" + " mm/filemap.c | 9 ++--\n" + " mm/huge_memory.c | 1 +\n" + " mm/memcontrol.c | 2 +\n" + " mm/migrate.c | 2 +\n" + " mm/swap_state.c | 1 +\n" + " mm/vmscan.c | 1 +\n" + " mm/vmstat.c | 1 +\n" + " mm/workingset.c | 96 +++++++++++++++++++++++++++---------------\n" + " 12 files changed, 79 insertions(+), 43 deletions(-)\n" + "\n" + "diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h\n" + "index fc14b8b3f6ce..b8726b501166 100644\n" + "--- a/include/linux/mmzone.h\n" + "+++ b/include/linux/mmzone.h\n" + "@@ -156,6 +156,7 @@ enum node_stat_item {\n" + " \tNR_ISOLATED_FILE,\t/* Temporary isolated pages from file lru */\n" + " \tWORKINGSET_REFAULT,\n" + " \tWORKINGSET_ACTIVATE,\n" + "+\tWORKINGSET_RESTORE,\n" + " \tWORKINGSET_NODERECLAIM,\n" + " \tNR_ANON_MAPPED,\t/* Mapped anonymous pages */\n" + " \tNR_FILE_MAPPED,\t/* pagecache pages mapped into pagetables.\n" + "diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h\n" + "index d33e3280c8ad..f889af1a6aed 100644\n" + "--- a/include/linux/page-flags.h\n" + "+++ b/include/linux/page-flags.h\n" + "@@ -73,13 +73,14 @@\n" + " */\n" + " enum pageflags {\n" + " \tPG_locked,\t\t/* Page is locked. Don't touch. */\n" + "-\tPG_error,\n" + " \tPG_referenced,\n" + " \tPG_uptodate,\n" + " \tPG_dirty,\n" + " \tPG_lru,\n" + " \tPG_active,\n" + "+\tPG_workingset,\n" + " \tPG_waiters,\t\t/* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as \"PG_locked\" */\n" + "+\tPG_error,\n" + " \tPG_slab,\n" + " \tPG_owner_priv_1,\t/* Owner use. 
If pagecache, fs may use*/\n" + " \tPG_arch_1,\n" + "@@ -272,6 +273,8 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)\n" + " PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)\n" + " PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)\n" + " \tTESTCLEARFLAG(Active, active, PF_HEAD)\n" + "+PAGEFLAG(Workingset, workingset, PF_HEAD)\n" + "+\tTESTCLEARFLAG(Workingset, workingset, PF_HEAD)\n" + " __PAGEFLAG(Slab, slab, PF_NO_TAIL)\n" + " __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL)\n" + " PAGEFLAG(Checked, checked, PF_NO_COMPOUND)\t /* Used by some filesystems */\n" + "diff --git a/include/linux/swap.h b/include/linux/swap.h\n" + "index d83d28e53e62..914a173beee1 100644\n" + "--- a/include/linux/swap.h\n" + "+++ b/include/linux/swap.h\n" + "@@ -252,7 +252,7 @@ struct swap_info_struct {\n" + " \n" + " /* linux/mm/workingset.c */\n" + " void *workingset_eviction(struct address_space *mapping, struct page *page);\n" + "-bool workingset_refault(void *shadow);\n" + "+void workingset_refault(struct page *page, void *shadow);\n" + " void workingset_activation(struct page *page);\n" + " void workingset_update_node(struct radix_tree_node *node, void *private);\n" + " \n" + "diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h\n" + "index 8e50d01c645f..aac9eb272754 100644\n" + "--- a/include/trace/events/mmflags.h\n" + "+++ b/include/trace/events/mmflags.h\n" + "@@ -90,6 +90,7 @@\n" + " \t{1UL << PG_dirty,\t\t\"dirty\"\t\t},\t\t\\\n" + " \t{1UL << PG_lru,\t\t\t\"lru\"\t\t},\t\t\\\n" + " \t{1UL << PG_active,\t\t\"active\"\t},\t\t\\\n" + "+\t{1UL << PG_workingset,\t\t\"workingset\"\t},\t\t\\\n" + " \t{1UL << PG_slab,\t\t\"slab\"\t\t},\t\t\\\n" + " \t{1UL << PG_owner_priv_1,\t\"owner_priv_1\"\t},\t\t\\\n" + " \t{1UL << PG_arch_1,\t\t\"arch_1\"\t},\t\t\\\n" + "diff --git a/mm/filemap.c b/mm/filemap.c\n" + "index 65b4b6e7f7bd..da55a5693da9 100644\n" + "--- a/mm/filemap.c\n" + "+++ b/mm/filemap.c\n" + "@@ 
-823,12 +823,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,\n" + " \t\t * data from the working set, only to cache data that will\n" + " \t\t * get overwritten with something else, is a waste of memory.\n" + " \t\t */\n" + "-\t\tif (!(gfp_mask & __GFP_WRITE) &&\n" + "-\t\t shadow && workingset_refault(shadow)) {\n" + "-\t\t\tSetPageActive(page);\n" + "-\t\t\tworkingset_activation(page);\n" + "-\t\t} else\n" + "-\t\t\tClearPageActive(page);\n" + "+\t\tWARN_ON_ONCE(PageActive(page));\n" + "+\t\tif (!(gfp_mask & __GFP_WRITE) && shadow)\n" + "+\t\t\tworkingset_refault(page, shadow);\n" + " \t\tlru_cache_add(page);\n" + " \t}\n" + " \treturn ret;\n" + "diff --git a/mm/huge_memory.c b/mm/huge_memory.c\n" + "index 90731e3b7e58..b18ac8084c2a 100644\n" + "--- a/mm/huge_memory.c\n" + "+++ b/mm/huge_memory.c\n" + "@@ -2239,6 +2239,7 @@ static void __split_huge_page_tail(struct page *head, int tail,\n" + " \t\t\t (1L << PG_mlocked) |\n" + " \t\t\t (1L << PG_uptodate) |\n" + " \t\t\t (1L << PG_active) |\n" + "+\t\t\t (1L << PG_workingset) |\n" + " \t\t\t (1L << PG_locked) |\n" + " \t\t\t (1L << PG_unevictable) |\n" + " \t\t\t (1L << PG_dirty)));\n" + "diff --git a/mm/memcontrol.c b/mm/memcontrol.c\n" + "index e09741af816f..93b2eb063afd 100644\n" + "--- a/mm/memcontrol.c\n" + "+++ b/mm/memcontrol.c\n" + "@@ -5274,6 +5274,8 @@ static int memory_stat_show(struct seq_file *m, void *v)\n" + " \t\t stat[WORKINGSET_REFAULT]);\n" + " \tseq_printf(m, \"workingset_activate %lu\\n\",\n" + " \t\t stat[WORKINGSET_ACTIVATE]);\n" + "+\tseq_printf(m, \"workingset_restore %lu\\n\",\n" + "+\t\t stat[WORKINGSET_RESTORE]);\n" + " \tseq_printf(m, \"workingset_nodereclaim %lu\\n\",\n" + " \t\t stat[WORKINGSET_NODERECLAIM]);\n" + " \n" + "diff --git a/mm/migrate.c b/mm/migrate.c\n" + "index e84eeb4e4356..48f4a79869ce 100644\n" + "--- a/mm/migrate.c\n" + "+++ b/mm/migrate.c\n" + "@@ -624,6 +624,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)\n" + 
" \t\tSetPageActive(newpage);\n" + " \t} else if (TestClearPageUnevictable(page))\n" + " \t\tSetPageUnevictable(newpage);\n" + "+\tif (PageWorkingset(page))\n" + "+\t\tSetPageWorkingset(newpage);\n" + " \tif (PageChecked(page))\n" + " \t\tSetPageChecked(newpage);\n" + " \tif (PageMappedToDisk(page))\n" + "diff --git a/mm/swap_state.c b/mm/swap_state.c\n" + "index b68c93014f50..b39b3969be07 100644\n" + "--- a/mm/swap_state.c\n" + "+++ b/mm/swap_state.c\n" + "@@ -387,6 +387,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,\n" + " \t\t\t/*\n" + " \t\t\t * Initiate read into locked page and return.\n" + " \t\t\t */\n" + "+\t\t\tSetPageWorkingset(new_page);\n" + " \t\t\tlru_cache_add_anon(new_page);\n" + " \t\t\t*new_page_allocated = true;\n" + " \t\t\treturn new_page;\n" + "diff --git a/mm/vmscan.c b/mm/vmscan.c\n" + "index a1af041930a6..60357cd84c67 100644\n" + "--- a/mm/vmscan.c\n" + "+++ b/mm/vmscan.c\n" + "@@ -2022,6 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,\n" + " \t\t}\n" + " \n" + " \t\tClearPageActive(page);\t/* we are de-activating */\n" + "+\t\tSetPageWorkingset(page);\n" + " \t\tlist_add(&page->lru, &l_inactive);\n" + " \t}\n" + " \n" + "diff --git a/mm/vmstat.c b/mm/vmstat.c\n" + "index 9a4441bbeef2..87ce53498828 100644\n" + "--- a/mm/vmstat.c\n" + "+++ b/mm/vmstat.c\n" + "@@ -956,6 +956,7 @@ const char * const vmstat_text[] = {\n" + " \t\"nr_isolated_file\",\n" + " \t\"workingset_refault\",\n" + " \t\"workingset_activate\",\n" + "+\t\"workingset_restore\",\n" + " \t\"workingset_nodereclaim\",\n" + " \t\"nr_anon_pages\",\n" + " \t\"nr_mapped\",\n" + "diff --git a/mm/workingset.c b/mm/workingset.c\n" + "index 7119cd745ace..264f0498f2bc 100644\n" + "--- a/mm/workingset.c\n" + "+++ b/mm/workingset.c\n" + "@@ -120,7 +120,7 @@\n" + " * the only thing eating into inactive list space is active pages.\n" + " *\n" + " *\n" + "- *\t\tActivating refaulting pages\n" + "+ *\t\tRefaulting inactive pages\n" + " *\n" + " 
* All that is known about the active list is that the pages have been\n" + " * accessed more than once in the past. This means that at any given\n" + "@@ -133,6 +133,10 @@\n" + " * used less frequently than the refaulting page - or even not used at\n" + " * all anymore.\n" + " *\n" + "+ * That means if inactive cache is refaulting with a suitable refault\n" + "+ * distance, we assume the cache workingset is transitioning and put\n" + "+ * pressure on the current active list.\n" + "+ *\n" + " * If this is wrong and demotion kicks in, the pages which are truly\n" + " * used more frequently will be reactivated while the less frequently\n" + " * used once will be evicted from memory.\n" + "@@ -140,6 +144,14 @@\n" + " * But if this is right, the stale pages will be pushed out of memory\n" + " * and the used pages get to stay in cache.\n" + " *\n" + "+ *\t\tRefaulting active pages\n" + "+ *\n" + "+ * If on the other hand the refaulting pages have recently been\n" + "+ * deactivated, it means that the active list is no longer protecting\n" + "+ * actively used cache from reclaim. 
The cache is NOT transitioning to\n" + "+ * a different workingset; the existing workingset is thrashing in the\n" + "+ * space allocated to the page cache.\n" + "+ *\n" + " *\n" + " *\t\tImplementation\n" + " *\n" + "@@ -155,8 +167,7 @@\n" + " */\n" + " \n" + " #define EVICTION_SHIFT\t(RADIX_TREE_EXCEPTIONAL_ENTRY + \\\n" + "-\t\t\t NODES_SHIFT +\t\\\n" + "-\t\t\t MEM_CGROUP_ID_SHIFT)\n" + "+\t\t\t 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)\n" + " #define EVICTION_MASK\t(~0UL >> EVICTION_SHIFT)\n" + " \n" + " /*\n" + "@@ -169,23 +180,28 @@\n" + " */\n" + " static unsigned int bucket_order __read_mostly;\n" + " \n" + "-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)\n" + "+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,\n" + "+\t\t\t bool workingset)\n" + " {\n" + " \teviction >>= bucket_order;\n" + " \teviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;\n" + " \teviction = (eviction << NODES_SHIFT) | pgdat->node_id;\n" + "+\teviction = (eviction << 1) | workingset;\n" + " \teviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);\n" + " \n" + " \treturn (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);\n" + " }\n" + " \n" + " static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,\n" + "-\t\t\t unsigned long *evictionp)\n" + "+\t\t\t unsigned long *evictionp, bool *workingsetp)\n" + " {\n" + " \tunsigned long entry = (unsigned long)shadow;\n" + " \tint memcgid, nid;\n" + "+\tbool workingset;\n" + " \n" + " \tentry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;\n" + "+\tworkingset = entry & 1;\n" + "+\tentry >>= 1;\n" + " \tnid = entry & ((1UL << NODES_SHIFT) - 1);\n" + " \tentry >>= NODES_SHIFT;\n" + " \tmemcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);\n" + "@@ -194,6 +210,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,\n" + " \t*memcgidp = memcgid;\n" + " \t*pgdat = NODE_DATA(nid);\n" + " \t*evictionp = entry << bucket_order;\n" + "+\t*workingsetp = 
workingset;\n" + " }\n" + " \n" + " /**\n" + "@@ -206,8 +223,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,\n" + " */\n" + " void *workingset_eviction(struct address_space *mapping, struct page *page)\n" + " {\n" + "-\tstruct mem_cgroup *memcg = page_memcg(page);\n" + " \tstruct pglist_data *pgdat = page_pgdat(page);\n" + "+\tstruct mem_cgroup *memcg = page_memcg(page);\n" + " \tint memcgid = mem_cgroup_id(memcg);\n" + " \tunsigned long eviction;\n" + " \tstruct lruvec *lruvec;\n" + "@@ -219,30 +236,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)\n" + " \n" + " \tlruvec = mem_cgroup_lruvec(pgdat, memcg);\n" + " \teviction = atomic_long_inc_return(&lruvec->inactive_age);\n" + "-\treturn pack_shadow(memcgid, pgdat, eviction);\n" + "+\treturn pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));\n" + " }\n" + " \n" + " /**\n" + " * workingset_refault - evaluate the refault of a previously evicted page\n" + "+ * @page: the freshly allocated replacement page\n" + " * @shadow: shadow entry of the evicted page\n" + " *\n" + " * Calculates and evaluates the refault distance of the previously\n" + " * evicted page in the context of the node it was allocated in.\n" + "- *\n" + "- * Returns %true if the page should be activated, %false otherwise.\n" + " */\n" + "-bool workingset_refault(void *shadow)\n" + "+void workingset_refault(struct page *page, void *shadow)\n" + " {\n" + " \tunsigned long refault_distance;\n" + "+\tstruct pglist_data *pgdat;\n" + " \tunsigned long active_file;\n" + " \tstruct mem_cgroup *memcg;\n" + " \tunsigned long eviction;\n" + " \tstruct lruvec *lruvec;\n" + " \tunsigned long refault;\n" + "-\tstruct pglist_data *pgdat;\n" + "+\tbool workingset;\n" + " \tint memcgid;\n" + " \n" + "-\tunpack_shadow(shadow, &memcgid, &pgdat, &eviction);\n" + "+\tunpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);\n" + " \n" + " \trcu_read_lock();\n" + " \t/*\n" + "@@ -262,41 +279,50 
@@ bool workingset_refault(void *shadow)\n" + " \t * configurations instead.\n" + " \t */\n" + " \tmemcg = mem_cgroup_from_id(memcgid);\n" + "-\tif (!mem_cgroup_disabled() && !memcg) {\n" + "-\t\trcu_read_unlock();\n" + "-\t\treturn false;\n" + "-\t}\n" + "+\tif (!mem_cgroup_disabled() && !memcg)\n" + "+\t\tgoto out;\n" + " \tlruvec = mem_cgroup_lruvec(pgdat, memcg);\n" + " \trefault = atomic_long_read(&lruvec->inactive_age);\n" + " \tactive_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);\n" + " \n" + " \t/*\n" + "-\t * The unsigned subtraction here gives an accurate distance\n" + "-\t * across inactive_age overflows in most cases.\n" + "+\t * Calculate the refault distance\n" + " \t *\n" + "-\t * There is a special case: usually, shadow entries have a\n" + "-\t * short lifetime and are either refaulted or reclaimed along\n" + "-\t * with the inode before they get too old. But it is not\n" + "-\t * impossible for the inactive_age to lap a shadow entry in\n" + "-\t * the field, which can then can result in a false small\n" + "-\t * refault distance, leading to a false activation should this\n" + "-\t * old entry actually refault again. However, earlier kernels\n" + "-\t * used to deactivate unconditionally with *every* reclaim\n" + "-\t * invocation for the longest time, so the occasional\n" + "-\t * inappropriate activation leading to pressure on the active\n" + "-\t * list is not a problem.\n" + "+\t * The unsigned subtraction here gives an accurate distance\n" + "+\t * across inactive_age overflows in most cases. There is a\n" + "+\t * special case: usually, shadow entries have a short lifetime\n" + "+\t * and are either refaulted or reclaimed along with the inode\n" + "+\t * before they get too old. 
But it is not impossible for the\n" + "+\t * inactive_age to lap a shadow entry in the field, which can\n" + "+\t * then can result in a false small refault distance, leading\n" + "+\t * to a false activation should this old entry actually\n" + "+\t * refault again. However, earlier kernels used to deactivate\n" + "+\t * unconditionally with *every* reclaim invocation for the\n" + "+\t * longest time, so the occasional inappropriate activation\n" + "+\t * leading to pressure on the active list is not a problem.\n" + " \t */\n" + " \trefault_distance = (refault - eviction) & EVICTION_MASK;\n" + " \n" + " \tinc_lruvec_state(lruvec, WORKINGSET_REFAULT);\n" + " \n" + "-\tif (refault_distance <= active_file) {\n" + "-\t\tinc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);\n" + "-\t\trcu_read_unlock();\n" + "-\t\treturn true;\n" + "-\t}\n" + "+\t/*\n" + "+\t * Compare the distance to the existing workingset size. We\n" + "+\t * don't act on pages that couldn't stay resident even if all\n" + "+\t * the memory was available to the page cache.\n" + "+\t */\n" + "+\tif (refault_distance > active_file)\n" + "+\t\tgoto out;\n" + "+\n" + "+\tSetPageActive(page);\n" + "+\tSetPageWorkingset(page);\n" + "+\tatomic_long_inc(&lruvec->inactive_age);\n" + "+\tinc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);\n" + "+\n" + "+\t/* Page was active prior to eviction */\n" + "+\tif (workingset)\n" + "+\t\tinc_lruvec_state(lruvec, WORKINGSET_RESTORE);\n" + "+out:\n" + " \trcu_read_unlock();\n" + "-\treturn false;\n" + " }\n" + " \n" + " /**\n" + "-- \n" + 2.14.1 + "\01:4\0" + "fn\00003-mm-sched-memdelay-memory-health-interface-for-system.patch\0" + "b\0" + ">From c3e97f5daf99bcd54383eaab466c477dbb743dd9 Mon Sep 17 00:00:00 2001\n" + "From: Johannes Weiner <hannes@cmpxchg.org>\n" + "Date: Mon, 5 Jun 2017 16:07:22 -0400\n" + "Subject: [PATCH 3/3] mm/sched: memdelay: memory health interface for systems\n" + " and workloads\n" + "\n" + "Linux doesn't have a useful metric to describe the memory health of 
a\n" + "system, a cgroup container, or individual tasks.\n" + "\n" + "When workloads are bigger than available memory, they spend a certain\n" + "amount of their time inside page reclaim, waiting on thrashing cache,\n" + "and swapping in. This has impact on latency, and depending on the CPU\n" + "capacity in the system can also translate to a decrease in throughput.\n" + "\n" + "While Linux exports some stats and counters for these events, it does\n" + "not quantify the true impact they have on throughput and latency. How\n" + "much of the execution time is spent unproductively? This is important\n" + "to know when sizing workloads to systems and containers. It also comes\n" + "in handy when evaluating the effectiveness and efficiency of the\n" + "kernel's memory management policies and heuristics.\n" + "\n" + "This patch implements a metric that quantifies memory pressure in a\n" + "unit that matters most to applications and does not rely on hardware\n" + "aspects to be meaningful: wallclock time lost while waiting on memory.\n" + "\n" + "Whenever a task is blocked on refaults, swapins, or direct reclaim,\n" + "the time it spends is accounted on the task level and aggregated into\n" + "a domain state along with other tasks on the system and cgroup level.\n" + "\n" + "Each task has a /proc/<pid>/memdelay file that lists the microseconds\n" + "the task has been delayed since it's been forked. 
That file can be\n" + "sampled periodically for recent delays, or before and after certain\n" + "operations to measure their memory-related latencies.\n" + "\n" + "On the system and cgroup-level, there are /proc/memdelay and\n" + "memory.memdelay, respectively, and their format is as such:\n" + "\n" + "$ cat /proc/memdelay\n" + "2489084\n" + "41.61 47.28 29.66\n" + "0.00 0.00 0.00\n" + "\n" + "The first line shows the cumulative delay times of all tasks in the\n" + "domain - in this case, all tasks in the system cumulatively lost 2.49\n" + "seconds due to memory delays.\n" + "\n" + "The second and third lines show percentages spent in aggregate states\n" + "for the domain - system or cgroup - in a load average type format as\n" + "decaying averages over the last 1m, 5m, and 15m:\n" + "\n" + "The second line indicates the share of wall-time the domain spends in\n" + "a state where SOME tasks are delayed by memory while others are still\n" + "productive (runnable or iowait). This indicates a latency problem for\n" + "individual tasks, but since the CPU/IO capacity is still used, adding\n" + "more memory might not necessarily improve the domain's throughput.\n" + "\n" + "The third line indicates the share of wall-time the domain spends in a\n" + "state where ALL non-idle tasks are delayed by memory. In this state,\n" + "the domain is entirely unproductive due to a lack of memory.\n" + "\n" + "v2:\n" + "- fix active-delay condition when only other runnables, no iowait\n" + "- drop private lock from sched path, we can use the rq lock\n" + "- fix refault vs. 
simple lockwait detection\n" + "- drop ktime, we can use cpu_clock()\n" + "\n" + "XXX:\n" + "- eliminate redundant cgroup hierarchy walks in the scheduler\n" + "\n" + "Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>\n" + "---\n" + " fs/proc/array.c | 8 ++\n" + " fs/proc/base.c | 2 +\n" + " fs/proc/internal.h | 2 +\n" + " include/linux/memcontrol.h | 14 +++\n" + " include/linux/memdelay.h | 182 +++++++++++++++++++++++++++++\n" + " include/linux/sched.h | 8 ++\n" + " kernel/cgroup/cgroup.c | 3 +-\n" + " kernel/fork.c | 4 +\n" + " kernel/sched/Makefile | 2 +-\n" + " kernel/sched/core.c | 27 +++++\n" + " kernel/sched/memdelay.c | 118 +++++++++++++++++++\n" + " mm/Makefile | 2 +-\n" + " mm/compaction.c | 4 +\n" + " mm/filemap.c | 11 ++\n" + " mm/memcontrol.c | 25 ++++\n" + " mm/memdelay.c | 285 +++++++++++++++++++++++++++++++++++++++++++++\n" + " mm/page_alloc.c | 11 +-\n" + " mm/vmscan.c | 9 ++\n" + " 18 files changed, 712 insertions(+), 5 deletions(-)\n" + " create mode 100644 include/linux/memdelay.h\n" + " create mode 100644 kernel/sched/memdelay.c\n" + " create mode 100644 mm/memdelay.c\n" + "\n" + "diff --git a/fs/proc/array.c b/fs/proc/array.c\n" + "index 88c355574aa0..00e0e9aa3e70 100644\n" + "--- a/fs/proc/array.c\n" + "+++ b/fs/proc/array.c\n" + "@@ -611,6 +611,14 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,\n" + " \treturn 0;\n" + " }\n" + " \n" + "+int proc_pid_memdelay(struct seq_file *m, struct pid_namespace *ns,\n" + "+\t\t struct pid *pid, struct task_struct *task)\n" + "+{\n" + "+\tseq_put_decimal_ull(m, \"\", task->memdelay_total);\n" + "+\tseq_putc(m, '\\n');\n" + "+\treturn 0;\n" + "+}\n" + "+\n" + " #ifdef CONFIG_PROC_CHILDREN\n" + " static struct pid *\n" + " get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)\n" + "diff --git a/fs/proc/base.c b/fs/proc/base.c\n" + "index 719c2e943ea1..19f194940c80 100644\n" + "--- a/fs/proc/base.c\n" + "+++ b/fs/proc/base.c\n" + "@@ -2916,6 +2916,7 @@ static const 
struct pid_entry tgid_base_stuff[] = {\n" + " \tREG(\"cmdline\", S_IRUGO, proc_pid_cmdline_ops),\n" + " \tONE(\"stat\", S_IRUGO, proc_tgid_stat),\n" + " \tONE(\"statm\", S_IRUGO, proc_pid_statm),\n" + "+\tONE(\"memdelay\", S_IRUGO, proc_pid_memdelay),\n" + " \tREG(\"maps\", S_IRUGO, proc_pid_maps_operations),\n" + " #ifdef CONFIG_NUMA\n" + " \tREG(\"numa_maps\", S_IRUGO, proc_pid_numa_maps_operations),\n" + "@@ -3307,6 +3308,7 @@ static const struct pid_entry tid_base_stuff[] = {\n" + " \tREG(\"cmdline\", S_IRUGO, proc_pid_cmdline_ops),\n" + " \tONE(\"stat\", S_IRUGO, proc_tid_stat),\n" + " \tONE(\"statm\", S_IRUGO, proc_pid_statm),\n" + "+\tONE(\"memdelay\", S_IRUGO, proc_pid_memdelay),\n" + " \tREG(\"maps\", S_IRUGO, proc_tid_maps_operations),\n" + " #ifdef CONFIG_PROC_CHILDREN\n" + " \tREG(\"children\", S_IRUGO, proc_tid_children_operations),\n" + "diff --git a/fs/proc/internal.h b/fs/proc/internal.h\n" + "index aa2b89071630..7ab706c316b8 100644\n" + "--- a/fs/proc/internal.h\n" + "+++ b/fs/proc/internal.h\n" + "@@ -146,6 +146,8 @@ extern int proc_pid_status(struct seq_file *, struct pid_namespace *,\n" + " \t\t\t struct pid *, struct task_struct *);\n" + " extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,\n" + " \t\t\t struct pid *, struct task_struct *);\n" + "+extern int proc_pid_memdelay(struct seq_file *, struct pid_namespace *,\n" + "+\t\t\t struct pid *, struct task_struct *);\n" + " \n" + " /*\n" + " * base.c\n" + "diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h\n" + "index 9b15a4bcfa77..1f720d3090f7 100644\n" + "--- a/include/linux/memcontrol.h\n" + "+++ b/include/linux/memcontrol.h\n" + "@@ -30,6 +30,7 @@\n" + " #include <linux/vmstat.h>\n" + " #include <linux/writeback.h>\n" + " #include <linux/page-flags.h>\n" + "+#include <linux/memdelay.h>\n" + " \n" + " struct mem_cgroup;\n" + " struct page;\n" + "@@ -183,6 +184,9 @@ struct mem_cgroup {\n" + " \n" + " \tunsigned long soft_limit;\n" + " \n" + "+\t/* Memory 
delay measurement domain */\n" + "+\tstruct memdelay_domain *memdelay_domain;\n" + "+\n" + " \t/* vmpressure notifications */\n" + " \tstruct vmpressure vmpressure;\n" + " \n" + "@@ -728,6 +732,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,\n" + " \treturn &pgdat->lruvec;\n" + " }\n" + " \n" + "+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)\n" + "+{\n" + "+\treturn NULL;\n" + "+}\n" + "+\n" + " static inline bool mm_match_cgroup(struct mm_struct *mm,\n" + " \t\tstruct mem_cgroup *memcg)\n" + " {\n" + "@@ -740,6 +749,11 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,\n" + " \treturn true;\n" + " }\n" + " \n" + "+static inline struct mem_cgroup *mem_cgroup_from_task(struct task_struct *task)\n" + "+{\n" + "+\treturn NULL;\n" + "+}\n" + "+\n" + " static inline struct mem_cgroup *\n" + " mem_cgroup_iter(struct mem_cgroup *root,\n" + " \t\tstruct mem_cgroup *prev,\n" + "diff --git a/include/linux/memdelay.h b/include/linux/memdelay.h\n" + "new file mode 100644\n" + "index 000000000000..08ed4e4baedf\n" + "--- /dev/null\n" + "+++ b/include/linux/memdelay.h\n" + "@@ -0,0 +1,182 @@\n" + "+#ifndef _LINUX_MEMDELAY_H\n" + "+#define _LINUX_MEMDELAY_H\n" + "+\n" + "+#include <linux/spinlock_types.h>\n" + "+#include <linux/sched.h>\n" + "+\n" + "+struct seq_file;\n" + "+struct css_set;\n" + "+\n" + "+/*\n" + "+ * Task productivity states tracked by the scheduler\n" + "+ */\n" + "+enum memdelay_task_state {\n" + "+\tMTS_NONE,\t\t/* Idle/unqueued/untracked */\n" + "+\tMTS_IOWAIT,\t\t/* Waiting for IO, not memory delayed */\n" + "+\tMTS_RUNNABLE,\t\t/* On the runqueue, not memory delayed */\n" + "+\tMTS_DELAYED,\t\t/* Memory delayed, not running */\n" + "+\tMTS_DELAYED_ACTIVE,\t/* Memory delayed, actively running */\n" + "+\tNR_MEMDELAY_TASK_STATES,\n" + "+};\n" + "+\n" + "+/*\n" + "+ * System/cgroup delay state tracked by the VM, composed of the\n" + "+ * productivity states of all tasks inside the 
domain.\n" + "+ */\n" + "+enum memdelay_domain_state {\n" + "+\tMDS_NONE,\t\t/* No delayed tasks */\n" + "+\tMDS_SOME,\t\t/* Delayed tasks, working tasks */\n" + "+\tMDS_FULL,\t\t/* Delayed tasks, no working tasks */\n" + "+\tNR_MEMDELAY_DOMAIN_STATES,\n" + "+};\n" + "+\n" + "+struct memdelay_domain_cpu {\n" + "+\t/* Task states of the domain on this CPU */\n" + "+\tint tasks[NR_MEMDELAY_TASK_STATES];\n" + "+\n" + "+\t/* Delay state of the domain on this CPU */\n" + "+\tenum memdelay_domain_state state;\n" + "+\n" + "+\t/* Time of last state change */\n" + "+\tu64 state_start;\n" + "+};\n" + "+\n" + "+struct memdelay_domain {\n" + "+\t/* Aggregate delayed time of all domain tasks */\n" + "+\tunsigned long aggregate;\n" + "+\n" + "+\t/* Per-CPU delay states in the domain */\n" + "+\tstruct memdelay_domain_cpu __percpu *mdcs;\n" + "+\n" + "+\t/* Cumulative state times from all CPUs */\n" + "+\tunsigned long times[NR_MEMDELAY_DOMAIN_STATES];\n" + "+\n" + "+\t/* Decaying state time averages over 1m, 5m, 15m */\n" + "+\tunsigned long period_expires;\n" + "+\tunsigned long avg_full[3];\n" + "+\tunsigned long avg_some[3];\n" + "+};\n" + "+\n" + "+/* mm/memdelay.c */\n" + "+extern struct memdelay_domain memdelay_global_domain;\n" + "+void memdelay_init(void);\n" + "+void memdelay_task_change(struct task_struct *task,\n" + "+\t\t\t enum memdelay_task_state old,\n" + "+\t\t\t enum memdelay_task_state new);\n" + "+struct memdelay_domain *memdelay_domain_alloc(void);\n" + "+void memdelay_domain_free(struct memdelay_domain *md);\n" + "+int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md);\n" + "+\n" + "+/* kernel/sched/memdelay.c */\n" + "+void memdelay_enter(unsigned long *flags);\n" + "+void memdelay_leave(unsigned long *flags);\n" + "+\n" + "+/**\n" + "+ * memdelay_schedule - note a context switch\n" + "+ * @prev: task scheduling out\n" + "+ * @next: task scheduling in\n" + "+ *\n" + "+ * A task switch doesn't affect the balance between delayed and\n" + 
"+ * productive tasks, but we have to update whether the delay is\n" + "+ * actively using the CPU or not.\n" + "+ */\n" + "+static inline void memdelay_schedule(struct task_struct *prev,\n" + "+\t\t\t\t struct task_struct *next)\n" + "+{\n" + "+\tif (prev->flags & PF_MEMDELAY)\n" + "+\t\tmemdelay_task_change(prev, MTS_DELAYED_ACTIVE, MTS_DELAYED);\n" + "+\n" + "+\tif (next->flags & PF_MEMDELAY)\n" + "+\t\tmemdelay_task_change(next, MTS_DELAYED, MTS_DELAYED_ACTIVE);\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_wakeup - note a task waking up\n" + "+ * @task: the task\n" + "+ *\n" + "+ * Notes an idle task becoming productive. Delayed tasks remain\n" + "+ * delayed even when they become runnable.\n" + "+ */\n" + "+static inline void memdelay_wakeup(struct task_struct *task)\n" + "+{\n" + "+\tif (task->flags & PF_MEMDELAY)\n" + "+\t\treturn;\n" + "+\n" + "+\tif (task->in_iowait)\n" + "+\t\tmemdelay_task_change(task, MTS_IOWAIT, MTS_RUNNABLE);\n" + "+\telse\n" + "+\t\tmemdelay_task_change(task, MTS_NONE, MTS_RUNNABLE);\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_sleep - note a task going to sleep\n" + "+ * @task: the task\n" + "+ *\n" + "+ * Notes a working task becoming unproductive. 
Delayed tasks remain\n" + "+ * delayed.\n" + "+ */\n" + "+static inline void memdelay_sleep(struct task_struct *task)\n" + "+{\n" + "+\tif (task->flags & PF_MEMDELAY)\n" + "+\t\treturn;\n" + "+\n" + "+\tif (task->in_iowait)\n" + "+\t\tmemdelay_task_change(task, MTS_RUNNABLE, MTS_IOWAIT);\n" + "+\telse\n" + "+\t\tmemdelay_task_change(task, MTS_RUNNABLE, MTS_NONE);\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_del_add - track task movement between runqueues\n" + "+ * @task: the task\n" + "+ * @runnable: a runnable task is moved if %true, unqueued otherwise\n" + "+ * @add: task is being added if %true, removed otherwise\n" + "+ *\n" + "+ * Update the memdelay domain per-cpu states as tasks are being moved\n" + "+ * around the runqueues.\n" + "+ */\n" + "+static inline void memdelay_del_add(struct task_struct *task,\n" + "+\t\t\t\t bool runnable, bool add)\n" + "+{\n" + "+\tint state;\n" + "+\n" + "+\tif (task->flags & PF_MEMDELAY)\n" + "+\t\tstate = MTS_DELAYED;\n" + "+\telse if (runnable)\n" + "+\t\tstate = MTS_RUNNABLE;\n" + "+\telse if (task->in_iowait)\n" + "+\t\tstate = MTS_IOWAIT;\n" + "+\telse\n" + "+\t\treturn; /* already MTS_NONE */\n" + "+\n" + "+\tif (add)\n" + "+\t\tmemdelay_task_change(task, MTS_NONE, state);\n" + "+\telse\n" + "+\t\tmemdelay_task_change(task, state, MTS_NONE);\n" + "+}\n" + "+\n" + "+static inline void memdelay_del_runnable(struct task_struct *task)\n" + "+{\n" + "+\tmemdelay_del_add(task, true, false);\n" + "+}\n" + "+\n" + "+static inline void memdelay_add_runnable(struct task_struct *task)\n" + "+{\n" + "+\tmemdelay_del_add(task, true, true);\n" + "+}\n" + "+\n" + "+static inline void memdelay_del_sleeping(struct task_struct *task)\n" + "+{\n" + "+\tmemdelay_del_add(task, false, false);\n" + "+}\n" + "+\n" + "+static inline void memdelay_add_sleeping(struct task_struct *task)\n" + "+{\n" + "+\tmemdelay_del_add(task, false, true);\n" + "+}\n" + "+\n" + "+#ifdef CONFIG_CGROUPS\n" + "+void cgroup_move_task(struct task_struct *task, 
struct css_set *to);\n" + "+#endif\n" + "+\n" + "+#endif /* _LINUX_MEMDELAY_H */\n" + "diff --git a/include/linux/sched.h b/include/linux/sched.h\n" + "index c05ac5f5aa03..de15e3c8c43a 100644\n" + "--- a/include/linux/sched.h\n" + "+++ b/include/linux/sched.h\n" + "@@ -651,6 +651,7 @@ struct task_struct {\n" + " \t/* disallow userland-initiated cgroup migration */\n" + " \tunsigned\t\t\tno_cgroup_migration:1;\n" + " #endif\n" + "+\tunsigned\t\t\tmemdelay_migrate_enqueue:1;\n" + " \n" + " \tunsigned long\t\t\tatomic_flags; /* Flags requiring atomic access. */\n" + " \n" + "@@ -871,6 +872,12 @@ struct task_struct {\n" + " \n" + " \tstruct io_context\t\t*io_context;\n" + " \n" + "+\tu64\t\t\t\tmemdelay_start;\n" + "+\tunsigned long\t\t\tmemdelay_total;\n" + "+#ifdef CONFIG_DEBUG_VM\n" + "+\tint\t\t\t\tmemdelay_state;\n" + "+#endif\n" + "+\n" + " \t/* Ptrace state: */\n" + " \tunsigned long\t\t\tptrace_message;\n" + " \tsiginfo_t\t\t\t*last_siginfo;\n" + "@@ -1274,6 +1281,7 @@ extern struct pid *cad_pid;\n" + " #define PF_KTHREAD\t\t0x00200000\t/* I am a kernel thread */\n" + " #define PF_RANDOMIZE\t\t0x00400000\t/* Randomize virtual address space */\n" + " #define PF_SWAPWRITE\t\t0x00800000\t/* Allowed to write to swap */\n" + "+#define PF_MEMDELAY\t\t0x01000000\t/* Delayed due to lack of memory */\n" + " #define PF_NO_SETAFFINITY\t0x04000000\t/* Userland is not allowed to meddle with cpus_allowed */\n" + " #define PF_MCE_EARLY\t\t0x08000000 /* Early kill for mce process policy */\n" + " #define PF_MUTEX_TESTER\t\t0x20000000\t/* Thread belongs to the rt mutex tester */\n" + "diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c\n" + "index df2e0f14a95d..930aaef50396 100644\n" + "--- a/kernel/cgroup/cgroup.c\n" + "+++ b/kernel/cgroup/cgroup.c\n" + "@@ -699,7 +699,8 @@ static void css_set_move_task(struct task_struct *task,\n" + " \t\t */\n" + " \t\tWARN_ON_ONCE(task->flags & PF_EXITING);\n" + " \n" + "-\t\trcu_assign_pointer(task->cgroups, to_cset);\n" + 
"+\t\tcgroup_move_task(task, to_cset);\n" + "+\n" + " \t\tlist_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :\n" + " \t\t\t\t\t\t\t &to_cset->tasks);\n" + " \t}\n" + "diff --git a/kernel/fork.c b/kernel/fork.c\n" + "index b7e9e57b71ea..96dd35393be9 100644\n" + "--- a/kernel/fork.c\n" + "+++ b/kernel/fork.c\n" + "@@ -1208,6 +1208,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)\n" + " \tint retval;\n" + " \n" + " \ttsk->min_flt = tsk->maj_flt = 0;\n" + "+\ttsk->memdelay_total = 0;\n" + "+#ifdef CONFIG_DEBUG_VM\n" + "+\ttsk->memdelay_state = 0;\n" + "+#endif\n" + " \ttsk->nvcsw = tsk->nivcsw = 0;\n" + " #ifdef CONFIG_DETECT_HUNG_TASK\n" + " \ttsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;\n" + "diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile\n" + "index 53f0164ed362..84390fc42f60 100644\n" + "--- a/kernel/sched/Makefile\n" + "+++ b/kernel/sched/Makefile\n" + "@@ -17,7 +17,7 @@ endif\n" + " \n" + " obj-y += core.o loadavg.o clock.o cputime.o\n" + " obj-y += idle_task.o fair.o rt.o deadline.o\n" + "-obj-y += wait.o wait_bit.o swait.o completion.o idle.o\n" + "+obj-y += wait.o wait_bit.o swait.o completion.o idle.o memdelay.o\n" + " obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o\n" + " obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o\n" + " obj-$(CONFIG_SCHEDSTATS) += stats.o\n" + "diff --git a/kernel/sched/core.c b/kernel/sched/core.c\n" + "index 0869b20fba81..bf105c870da6 100644\n" + "--- a/kernel/sched/core.c\n" + "+++ b/kernel/sched/core.c\n" + "@@ -26,6 +26,7 @@\n" + " #include <linux/profile.h>\n" + " #include <linux/security.h>\n" + " #include <linux/syscalls.h>\n" + "+#include <linux/memdelay.h>\n" + " \n" + " #include <asm/switch_to.h>\n" + " #include <asm/tlb.h>\n" + "@@ -759,6 +760,14 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)\n" + " \tif (!(flags & ENQUEUE_RESTORE))\n" + " \t\tsched_info_queued(rq, p);\n" + " \n" + "+\tWARN_ON_ONCE(!(flags & 
ENQUEUE_WAKEUP) && p->memdelay_migrate_enqueue);\n" + "+\tif (!(flags & ENQUEUE_WAKEUP) || p->memdelay_migrate_enqueue) {\n" + "+\t\tmemdelay_add_runnable(p);\n" + "+\t\tp->memdelay_migrate_enqueue = 0;\n" + "+\t} else {\n" + "+\t\tmemdelay_wakeup(p);\n" + "+\t}\n" + "+\n" + " \tp->sched_class->enqueue_task(rq, p, flags);\n" + " }\n" + " \n" + "@@ -770,6 +779,11 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)\n" + " \tif (!(flags & DEQUEUE_SAVE))\n" + " \t\tsched_info_dequeued(rq, p);\n" + " \n" + "+\tif (!(flags & DEQUEUE_SLEEP))\n" + "+\t\tmemdelay_del_runnable(p);\n" + "+\telse\n" + "+\t\tmemdelay_sleep(p);\n" + "+\n" + " \tp->sched_class->dequeue_task(rq, p, flags);\n" + " }\n" + " \n" + "@@ -2044,7 +2058,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)\n" + " \n" + " \tcpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);\n" + " \tif (task_cpu(p) != cpu) {\n" + "+\t\tstruct rq_flags rf;\n" + "+\t\tstruct rq *rq;\n" + "+\n" + " \t\twake_flags |= WF_MIGRATED;\n" + "+\n" + "+\t\trq = __task_rq_lock(p, &rf);\n" + "+\t\tmemdelay_del_sleeping(p);\n" + "+\t\t__task_rq_unlock(rq, &rf);\n" + "+\t\tp->memdelay_migrate_enqueue = 1;\n" + "+\n" + " \t\tset_task_cpu(p, cpu);\n" + " \t}\n" + " \n" + "@@ -3326,6 +3349,8 @@ static void __sched notrace __schedule(bool preempt)\n" + " \t\trq->curr = next;\n" + " \t\t++*switch_count;\n" + " \n" + "+\t\tmemdelay_schedule(prev, next);\n" + "+\n" + " \t\ttrace_sched_switch(preempt, prev, next);\n" + " \n" + " \t\t/* Also unlocks the rq: */\n" + "@@ -5919,6 +5944,8 @@ void __init sched_init(void)\n" + " \n" + " \tinit_schedstats();\n" + " \n" + "+\tmemdelay_init();\n" + "+\n" + " \tscheduler_running = 1;\n" + " }\n" + " \n" + "diff --git a/kernel/sched/memdelay.c b/kernel/sched/memdelay.c\n" + "new file mode 100644\n" + "index 000000000000..1d4813cd018a\n" + "--- /dev/null\n" + "+++ b/kernel/sched/memdelay.c\n" + "@@ -0,0 +1,118 @@\n" + "+/*\n" + "+ * 
Memory delay metric\n" + "+ *\n" + "+ * Copyright (c) 2017 Facebook, Johannes Weiner\n" + "+ *\n" + "+ * This code quantifies and reports to userspace the wall-time impact\n" + "+ * of memory pressure on the system and memory-controlled cgroups.\n" + "+ */\n" + "+\n" + "+#include <linux/memdelay.h>\n" + "+#include <linux/cgroup.h>\n" + "+#include <linux/sched.h>\n" + "+\n" + "+#include \"sched.h\"\n" + "+\n" + "+/**\n" + "+ * memdelay_enter - mark the beginning of a memory delay section\n" + "+ * @flags: flags to handle nested memdelay sections\n" + "+ *\n" + "+ * Marks the calling task as being delayed due to a lack of memory,\n" + "+ * such as waiting for a workingset refault or performing reclaim.\n" + "+ */\n" + "+void memdelay_enter(unsigned long *flags)\n" + "+{\n" + "+\tstruct rq_flags rf;\n" + "+\tstruct rq *rq;\n" + "+\n" + "+\t*flags = current->flags & PF_MEMDELAY;\n" + "+\tif (*flags)\n" + "+\t\treturn;\n" + "+\t/*\n" + "+\t * PF_MEMDELAY & accounting needs to be atomic wrt changes to\n" + "+\t * the task's scheduling state and its domain association.\n" + "+\t * Otherwise we could race with CPU or cgroup migration and\n" + "+\t * misaccount.\n" + "+\t */\n" + "+\tlocal_irq_disable();\n" + "+\trq = this_rq();\n" + "+\trq_lock(rq, &rf);\n" + "+\n" + "+\tcurrent->flags |= PF_MEMDELAY;\n" + "+\tmemdelay_task_change(current, MTS_RUNNABLE, MTS_DELAYED_ACTIVE);\n" + "+\n" + "+\trq_unlock(rq, &rf);\n" + "+\tlocal_irq_enable();\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_leave - mark the end of a memory delay section\n" + "+ * @flags: flags to handle nested memdelay sections\n" + "+ *\n" + "+ * Marks the calling task as no longer delayed due to memory.\n" + "+ */\n" + "+void memdelay_leave(unsigned long *flags)\n" + "+{\n" + "+\tstruct rq_flags rf;\n" + "+\tstruct rq *rq;\n" + "+\n" + "+\tif (*flags)\n" + "+\t\treturn;\n" + "+\t/*\n" + "+\t * PF_MEMDELAY & accounting needs to be atomic wrt changes to\n" + "+\t * the task's scheduling state and its domain 
association.\n" + "+\t * Otherwise we could race with CPU or cgroup migration and\n" + "+\t * misaccount.\n" + "+\t */\n" + "+\tlocal_irq_disable();\n" + "+\trq = this_rq();\n" + "+\trq_lock(rq, &rf);\n" + "+\n" + "+\tcurrent->flags &= ~PF_MEMDELAY;\n" + "+\tmemdelay_task_change(current, MTS_DELAYED_ACTIVE, MTS_RUNNABLE);\n" + "+\n" + "+\trq_unlock(rq, &rf);\n" + "+\tlocal_irq_enable();\n" + "+}\n" + "+\n" + "+#ifdef CONFIG_CGROUPS\n" + "+/**\n" + "+ * cgroup_move_task - move task to a different cgroup\n" + "+ * @task: the task\n" + "+ * @to: the target css_set\n" + "+ *\n" + "+ * Move task to a new cgroup and safely migrate its associated\n" + "+ * delayed/working state between the different domains.\n" + "+ *\n" + "+ * This function acquires the task's rq lock to lock out concurrent\n" + "+ * changes to the task's scheduling state and - in case the task is\n" + "+ * running - concurrent changes to its delay state.\n" + "+ */\n" + "+void cgroup_move_task(struct task_struct *task, struct css_set *to)\n" + "+{\n" + "+\tstruct rq_flags rf;\n" + "+\tstruct rq *rq;\n" + "+\tint state;\n" + "+\n" + "+\trq = task_rq_lock(task, &rf);\n" + "+\n" + "+\tif (task->flags & PF_MEMDELAY)\n" + "+\t\tstate = MTS_DELAYED + task_current(rq, task);\n" + "+\telse if (task_on_rq_queued(task))\n" + "+\t\tstate = MTS_RUNNABLE;\n" + "+\telse if (task->in_iowait)\n" + "+\t\tstate = MTS_IOWAIT;\n" + "+\telse\n" + "+\t\tstate = MTS_NONE;\n" + "+\n" + "+\t/*\n" + "+\t * Lame to do this here, but the scheduler cannot be locked\n" + "+\t * from the outside, so we move cgroups from inside sched/.\n" + "+\t */\n" + "+\tmemdelay_task_change(task, state, MTS_NONE);\n" + "+\trcu_assign_pointer(task->cgroups, to);\n" + "+\tmemdelay_task_change(task, MTS_NONE, state);\n" + "+\n" + "+\ttask_rq_unlock(rq, task, &rf);\n" + "+}\n" + "+#endif /* CONFIG_CGROUPS */\n" + "diff --git a/mm/Makefile b/mm/Makefile\n" + "index 411bd24d4a7c..c9bdbc5627e5 100644\n" + "--- a/mm/Makefile\n" + "+++ b/mm/Makefile\n" + 
"@@ -39,7 +39,7 @@ obj-y\t\t\t:= filemap.o mempool.o oom_kill.o \\\n" + " \t\t\t mm_init.o mmu_context.o percpu.o slab_common.o \\\n" + " \t\t\t compaction.o vmacache.o swap_slots.o \\\n" + " \t\t\t interval_tree.o list_lru.o workingset.o \\\n" + "-\t\t\t debug.o $(mmu-y)\n" + "+\t\t\t memdelay.o debug.o $(mmu-y)\n" + " \n" + " obj-y += init-mm.o\n" + " \n" + "diff --git a/mm/compaction.c b/mm/compaction.c\n" + "index fb548e4c7bd4..adf67de23fee 100644\n" + "--- a/mm/compaction.c\n" + "+++ b/mm/compaction.c\n" + "@@ -2040,11 +2040,15 @@ static int kcompactd(void *p)\n" + " \tpgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;\n" + " \n" + " \twhile (!kthread_should_stop()) {\n" + "+\t\tunsigned long mdflags;\n" + "+\n" + " \t\ttrace_mm_compaction_kcompactd_sleep(pgdat->node_id);\n" + " \t\twait_event_freezable(pgdat->kcompactd_wait,\n" + " \t\t\t\tkcompactd_work_requested(pgdat));\n" + " \n" + "+\t\tmemdelay_enter(&mdflags);\n" + " \t\tkcompactd_do_work(pgdat);\n" + "+\t\tmemdelay_leave(&mdflags);\n" + " \t}\n" + " \n" + " \treturn 0;\n" + "diff --git a/mm/filemap.c b/mm/filemap.c\n" + "index da55a5693da9..648418694405 100644\n" + "--- a/mm/filemap.c\n" + "+++ b/mm/filemap.c\n" + "@@ -36,6 +36,7 @@\n" + " #include <linux/memcontrol.h>\n" + " #include <linux/cleancache.h>\n" + " #include <linux/rmap.h>\n" + "+#include <linux/memdelay.h>\n" + " #include \"internal.h\"\n" + " \n" + " #define CREATE_TRACE_POINTS\n" + "@@ -961,8 +962,15 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,\n" + " {\n" + " \tstruct wait_page_queue wait_page;\n" + " \twait_queue_entry_t *wait = &wait_page.wait;\n" + "+\tunsigned long mdflags;\n" + "+\tbool refault = false;\n" + " \tint ret = 0;\n" + " \n" + "+\tif (bit_nr == PG_locked && !PageUptodate(page) && PageWorkingset(page)) {\n" + "+\t\tmemdelay_enter(&mdflags);\n" + "+\t\trefault = true;\n" + "+\t}\n" + "+\n" + " \tinit_wait(wait);\n" + " \twait->flags = lock ? 
WQ_FLAG_EXCLUSIVE : 0;\n" + " \twait->func = wake_page_function;\n" + "@@ -1001,6 +1009,9 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,\n" + " \n" + " \tfinish_wait(q, wait);\n" + " \n" + "+\tif (refault)\n" + "+\t\tmemdelay_leave(&mdflags);\n" + "+\n" + " \t/*\n" + " \t * A signal could leave PageWaiters set. Clearing it here if\n" + " \t * !waitqueue_active would be possible (by open-coding finish_wait),\n" + "diff --git a/mm/memcontrol.c b/mm/memcontrol.c\n" + "index 93b2eb063afd..102f0f4d3f5c 100644\n" + "--- a/mm/memcontrol.c\n" + "+++ b/mm/memcontrol.c\n" + "@@ -65,6 +65,7 @@\n" + " #include <linux/lockdep.h>\n" + " #include <linux/file.h>\n" + " #include <linux/tracehook.h>\n" + "+#include <linux/memdelay.h>\n" + " #include \"internal.h\"\n" + " #include <net/sock.h>\n" + " #include <net/ip.h>\n" + "@@ -3926,6 +3927,8 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,\n" + " \treturn ret;\n" + " }\n" + " \n" + "+static int memory_memdelay_show(struct seq_file *m, void *v);\n" + "+\n" + " static struct cftype mem_cgroup_legacy_files[] = {\n" + " \t{\n" + " \t\t.name = \"usage_in_bytes\",\n" + "@@ -3993,6 +3996,10 @@ static struct cftype mem_cgroup_legacy_files[] = {\n" + " \t{\n" + " \t\t.name = \"pressure_level\",\n" + " \t},\n" + "+\t{\n" + "+\t\t.name = \"memdelay\",\n" + "+\t\t.seq_show = memory_memdelay_show,\n" + "+\t},\n" + " #ifdef CONFIG_NUMA\n" + " \t{\n" + " \t\t.name = \"numa_stat\",\n" + "@@ -4170,6 +4177,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)\n" + " \n" + " \tfor_each_node(node)\n" + " \t\tfree_mem_cgroup_per_node_info(memcg, node);\n" + "+\tmemdelay_domain_free(memcg->memdelay_domain);\n" + " \tfree_percpu(memcg->stat);\n" + " \tkfree(memcg);\n" + " }\n" + "@@ -4275,10 +4283,15 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)\n" + " \n" + " \t/* The following stuff does not apply to the root */\n" + " \tif (!parent) {\n" + "+\t\tmemcg->memdelay_domain = 
&memdelay_global_domain;\n" + " \t\troot_mem_cgroup = memcg;\n" + " \t\treturn &memcg->css;\n" + " \t}\n" + " \n" + "+\tmemcg->memdelay_domain = memdelay_domain_alloc();\n" + "+\tif (!memcg->memdelay_domain)\n" + "+\t\tgoto fail;\n" + "+\n" + " \terror = memcg_online_kmem(memcg);\n" + " \tif (error)\n" + " \t\tgoto fail;\n" + "@@ -5282,6 +5295,13 @@ static int memory_stat_show(struct seq_file *m, void *v)\n" + " \treturn 0;\n" + " }\n" + " \n" + "+static int memory_memdelay_show(struct seq_file *m, void *v)\n" + "+{\n" + "+\tstruct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));\n" + "+\n" + "+\treturn memdelay_domain_show(m, memcg->memdelay_domain);\n" + "+}\n" + "+\n" + " static struct cftype memory_files[] = {\n" + " \t{\n" + " \t\t.name = \"current\",\n" + "@@ -5317,6 +5337,11 @@ static struct cftype memory_files[] = {\n" + " \t\t.flags = CFTYPE_NOT_ON_ROOT,\n" + " \t\t.seq_show = memory_stat_show,\n" + " \t},\n" + "+\t{\n" + "+\t\t.name = \"memdelay\",\n" + "+\t\t.flags = CFTYPE_NOT_ON_ROOT,\n" + "+\t\t.seq_show = memory_memdelay_show,\n" + "+\t},\n" + " \t{ }\t/* terminate */\n" + " };\n" + " \n" + "diff --git a/mm/memdelay.c b/mm/memdelay.c\n" + "new file mode 100644\n" + "index 000000000000..c43d6f7ba22a\n" + "--- /dev/null\n" + "+++ b/mm/memdelay.c\n" + "@@ -0,0 +1,285 @@\n" + "+/*\n" + "+ * Memory delay metric\n" + "+ *\n" + "+ * Copyright (c) 2017 Facebook, Johannes Weiner\n" + "+ *\n" + "+ * This code quantifies and reports to userspace the wall-time impact\n" + "+ * of memory pressure on the system and memory-controlled cgroups.\n" + "+ */\n" + "+\n" + "+#include <linux/sched/loadavg.h>\n" + "+#include <linux/sched/clock.h>\n" + "+#include <linux/memcontrol.h>\n" + "+#include <linux/memdelay.h>\n" + "+#include <linux/seq_file.h>\n" + "+#include <linux/proc_fs.h>\n" + "+#include <linux/kernel.h>\n" + "+#include <linux/module.h>\n" + "+#include <linux/slab.h>\n" + "+#include <linux/fs.h>\n" + "+\n" + "+static DEFINE_PER_CPU(struct 
memdelay_domain_cpu, global_domain_cpus);\n" + "+\n" + "+/* System-level keeping of memory delay statistics */\n" + "+struct memdelay_domain memdelay_global_domain = {\n" + "+\t.mdcs = &global_domain_cpus,\n" + "+};\n" + "+\n" + "+static void domain_init(struct memdelay_domain *md)\n" + "+{\n" + "+\tmd->period_expires = jiffies + LOAD_FREQ;\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_init - initialize the memdelay subsystem\n" + "+ *\n" + "+ * This needs to run before the scheduler starts queuing and\n" + "+ * scheduling tasks.\n" + "+ */\n" + "+void __init memdelay_init(void)\n" + "+{\n" + "+\tdomain_init(&memdelay_global_domain);\n" + "+}\n" + "+\n" + "+static void domain_move_clock(struct memdelay_domain *md)\n" + "+{\n" + "+\tunsigned long expires = READ_ONCE(md->period_expires);\n" + "+\tunsigned long none, some, full;\n" + "+\tint missed_periods;\n" + "+\tunsigned long next;\n" + "+\tint i;\n" + "+\n" + "+\tif (time_before(jiffies, expires))\n" + "+\t\treturn;\n" + "+\n" + "+\tmissed_periods = 1 + (jiffies - expires) / LOAD_FREQ;\n" + "+\tnext = expires + (missed_periods * LOAD_FREQ);\n" + "+\n" + "+\tif (cmpxchg(&md->period_expires, expires, next) != expires)\n" + "+\t\treturn;\n" + "+\n" + "+\tnone = xchg(&md->times[MDS_NONE], 0);\n" + "+\tsome = xchg(&md->times[MDS_SOME], 0);\n" + "+\tfull = xchg(&md->times[MDS_FULL], 0);\n" + "+\n" + "+\tfor (i = 0; i < missed_periods; i++) {\n" + "+\t\tunsigned long pct;\n" + "+\n" + "+\t\tpct = some * 100 / max(none + some + full, 1UL);\n" + "+\t\tpct *= FIXED_1;\n" + "+\t\tCALC_LOAD(md->avg_some[0], EXP_1, pct);\n" + "+\t\tCALC_LOAD(md->avg_some[1], EXP_5, pct);\n" + "+\t\tCALC_LOAD(md->avg_some[2], EXP_15, pct);\n" + "+\n" + "+\t\tpct = full * 100 / max(none + some + full, 1UL);\n" + "+\t\tpct *= FIXED_1;\n" + "+\t\tCALC_LOAD(md->avg_full[0], EXP_1, pct);\n" + "+\t\tCALC_LOAD(md->avg_full[1], EXP_5, pct);\n" + "+\t\tCALC_LOAD(md->avg_full[2], EXP_15, pct);\n" + "+\n" + "+\t\tnone = some = full = 0;\n" + "+\t}\n" + 
"+}\n" + "+\n" + "+static void domain_cpu_update(struct memdelay_domain *md, int cpu,\n" + "+\t\t\t enum memdelay_task_state old,\n" + "+\t\t\t enum memdelay_task_state new)\n" + "+{\n" + "+\tenum memdelay_domain_state state;\n" + "+\tstruct memdelay_domain_cpu *mdc;\n" + "+\tunsigned long delta;\n" + "+\tu64 now;\n" + "+\n" + "+\tmdc = per_cpu_ptr(md->mdcs, cpu);\n" + "+\n" + "+\tif (old) {\n" + "+\t\tWARN_ONCE(!mdc->tasks[old], \"cpu=%d old=%d new=%d counter=%d\\n\",\n" + "+\t\t\t cpu, old, new, mdc->tasks[old]);\n" + "+\t\tmdc->tasks[old] -= 1;\n" + "+\t}\n" + "+\tif (new)\n" + "+\t\tmdc->tasks[new] += 1;\n" + "+\n" + "+\t/*\n" + "+\t * The domain is somewhat delayed when a number of tasks are\n" + "+\t * delayed but there are still others running the workload.\n" + "+\t *\n" + "+\t * The domain is fully delayed when all non-idle tasks on the\n" + "+\t * CPU are delayed, or when a delayed task is actively running\n" + "+\t * and preventing productive tasks from making headway.\n" + "+\t *\n" + "+\t * The state times then add up over all CPUs in the domain: if\n" + "+\t * the domain is fully blocked on one CPU and there is another\n" + "+\t * one running the workload, the domain is considered fully\n" + "+\t * blocked 50% of the time.\n" + "+\t */\n" + "+\tif (mdc->tasks[MTS_DELAYED_ACTIVE] && !mdc->tasks[MTS_IOWAIT])\n" + "+\t\tstate = MDS_FULL;\n" + "+\telse if (mdc->tasks[MTS_DELAYED])\n" + "+\t\tstate = (mdc->tasks[MTS_RUNNABLE] || mdc->tasks[MTS_IOWAIT]) ?\n" + "+\t\t\tMDS_SOME : MDS_FULL;\n" + "+\telse\n" + "+\t\tstate = MDS_NONE;\n" + "+\n" + "+\tif (mdc->state == state)\n" + "+\t\treturn;\n" + "+\n" + "+\tnow = cpu_clock(cpu);\n" + "+\tdelta = (now - mdc->state_start) / NSEC_PER_USEC;\n" + "+\n" + "+\tdomain_move_clock(md);\n" + "+\tmd->times[mdc->state] += delta;\n" + "+\n" + "+\tmdc->state = state;\n" + "+\tmdc->state_start = now;\n" + "+}\n" + "+\n" + "+static struct memdelay_domain *memcg_domain(struct mem_cgroup *memcg)\n" + "+{\n" + "+#ifdef 
CONFIG_MEMCG\n" + "+\tif (!mem_cgroup_disabled())\n" + "+\t\treturn memcg->memdelay_domain;\n" + "+#endif\n" + "+\treturn &memdelay_global_domain;\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_task_change - note a task changing its delay/work state\n" + "+ * @task: the task changing state\n" + "+ * @old: old task state\n" + "+ * @new: new task state\n" + "+ *\n" + "+ * Updates the task's domain counters to reflect a change in the\n" + "+ * task's delayed/working state.\n" + "+ */\n" + "+void memdelay_task_change(struct task_struct *task,\n" + "+\t\t\t enum memdelay_task_state old,\n" + "+\t\t\t enum memdelay_task_state new)\n" + "+{\n" + "+\tint cpu = task_cpu(task);\n" + "+\tstruct mem_cgroup *memcg;\n" + "+\tunsigned long delay = 0;\n" + "+\n" + "+#ifdef CONFIG_DEBUG_VM\n" + "+\tWARN_ONCE(task->memdelay_state != old,\n" + "+\t\t \"cpu=%d task=%p state=%d (in_iowait=%d PF_MEMDELAYED=%d) old=%d new=%d\\n\",\n" + "+\t\t cpu, task, task->memdelay_state, task->in_iowait,\n" + "+\t\t !!(task->flags & PF_MEMDELAY), old, new);\n" + "+\ttask->memdelay_state = new;\n" + "+#endif\n" + "+\n" + "+\t/* Account when tasks are entering and leaving delays */\n" + "+\tif (old < MTS_DELAYED && new >= MTS_DELAYED) {\n" + "+\t\ttask->memdelay_start = cpu_clock(cpu);\n" + "+\t} else if (old >= MTS_DELAYED && new < MTS_DELAYED) {\n" + "+\t\tdelay = (cpu_clock(cpu) - task->memdelay_start) / NSEC_PER_USEC;\n" + "+\t\ttask->memdelay_total += delay;\n" + "+\t}\n" + "+\n" + "+\t/* Account domain state changes */\n" + "+\trcu_read_lock();\n" + "+\tmemcg = mem_cgroup_from_task(task);\n" + "+\tdo {\n" + "+\t\tstruct memdelay_domain *md;\n" + "+\n" + "+\t\tmd = memcg_domain(memcg);\n" + "+\t\tmd->aggregate += delay;\n" + "+\t\tdomain_cpu_update(md, cpu, old, new);\n" + "+\t} while (memcg && (memcg = parent_mem_cgroup(memcg)));\n" + "+\trcu_read_unlock();\n" + "+};\n" + "+\n" + "+/**\n" + "+ * memdelay_domain_alloc - allocate a cgroup memory delay domain\n" + "+ */\n" + "+struct memdelay_domain 
*memdelay_domain_alloc(void)\n" + "+{\n" + "+\tstruct memdelay_domain *md;\n" + "+\n" + "+\tmd = kzalloc(sizeof(*md), GFP_KERNEL);\n" + "+\tif (!md)\n" + "+\t\treturn NULL;\n" + "+\tmd->mdcs = alloc_percpu(struct memdelay_domain_cpu);\n" + "+\tif (!md->mdcs) {\n" + "+\t\tkfree(md);\n" + "+\t\treturn NULL;\n" + "+\t}\n" + "+\tdomain_init(md);\n" + "+\treturn md;\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_domain_free - free a cgroup memory delay domain\n" + "+ */\n" + "+void memdelay_domain_free(struct memdelay_domain *md)\n" + "+{\n" + "+\tif (md) {\n" + "+\t\tfree_percpu(md->mdcs);\n" + "+\t\tkfree(md);\n" + "+\t}\n" + "+}\n" + "+\n" + "+/**\n" + "+ * memdelay_domain_show - format memory delay domain stats to a seq_file\n" + "+ * @s: the seq_file\n" + "+ * @md: the memory domain\n" + "+ */\n" + "+int memdelay_domain_show(struct seq_file *s, struct memdelay_domain *md)\n" + "+{\n" + "+\tdomain_move_clock(md);\n" + "+\n" + "+\tseq_printf(s, \"%lu\\n\", md->aggregate);\n" + "+\n" + "+\tseq_printf(s, \"%lu.%02lu %lu.%02lu %lu.%02lu\\n\",\n" + "+\t\t LOAD_INT(md->avg_some[0]), LOAD_FRAC(md->avg_some[0]),\n" + "+\t\t LOAD_INT(md->avg_some[1]), LOAD_FRAC(md->avg_some[1]),\n" + "+\t\t LOAD_INT(md->avg_some[2]), LOAD_FRAC(md->avg_some[2]));\n" + "+\n" + "+\tseq_printf(s, \"%lu.%02lu %lu.%02lu %lu.%02lu\\n\",\n" + "+\t\t LOAD_INT(md->avg_full[0]), LOAD_FRAC(md->avg_full[0]),\n" + "+\t\t LOAD_INT(md->avg_full[1]), LOAD_FRAC(md->avg_full[1]),\n" + "+\t\t LOAD_INT(md->avg_full[2]), LOAD_FRAC(md->avg_full[2]));\n" + "+\n" + "+#ifdef CONFIG_DEBUG_VM\n" + "+\t{\n" + "+\t\tint cpu;\n" + "+\n" + "+\t\tfor_each_online_cpu(cpu) {\n" + "+\t\t\tstruct memdelay_domain_cpu *mdc;\n" + "+\n" + "+\t\t\tmdc = per_cpu_ptr(md->mdcs, cpu);\n" + "+\t\t\tseq_printf(s, \"%d %d %d %d\\n\",\n" + "+\t\t\t\t mdc->tasks[MTS_IOWAIT],\n" + "+\t\t\t\t mdc->tasks[MTS_RUNNABLE],\n" + "+\t\t\t\t mdc->tasks[MTS_DELAYED],\n" + "+\t\t\t\t mdc->tasks[MTS_DELAYED_ACTIVE]);\n" + "+\t\t}\n" + "+\t}\n" + 
"+#endif\n" + "+\n" + "+\treturn 0;\n" + "+}\n" + "+\n" + "+static int memdelay_show(struct seq_file *m, void *v)\n" + "+{\n" + "+\treturn memdelay_domain_show(m, &memdelay_global_domain);\n" + "+}\n" + "+\n" + "+static int memdelay_open(struct inode *inode, struct file *file)\n" + "+{\n" + "+\treturn single_open(file, memdelay_show, NULL);\n" + "+}\n" + "+\n" + "+static const struct file_operations memdelay_fops = {\n" + "+\t.open = memdelay_open,\n" + "+\t.read = seq_read,\n" + "+\t.llseek = seq_lseek,\n" + "+\t.release = single_release,\n" + "+};\n" + "+\n" + "+static int __init memdelay_proc_init(void)\n" + "+{\n" + "+\tproc_create(\"memdelay\", 0, NULL, &memdelay_fops);\n" + "+\treturn 0;\n" + "+}\n" + "+module_init(memdelay_proc_init);\n" + "diff --git a/mm/page_alloc.c b/mm/page_alloc.c\n" + "index 1423da8dd16f..d8d01e9df982 100644\n" + "--- a/mm/page_alloc.c\n" + "+++ b/mm/page_alloc.c\n" + "@@ -67,6 +67,7 @@\n" + " #include <linux/memcontrol.h>\n" + " #include <linux/ftrace.h>\n" + " #include <linux/nmi.h>\n" + "+#include <linux/memdelay.h>\n" + " \n" + " #include <asm/sections.h>\n" + " #include <asm/tlbflush.h>\n" + "@@ -3364,16 +3365,19 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,\n" + " \t\tunsigned int alloc_flags, const struct alloc_context *ac,\n" + " \t\tenum compact_priority prio, enum compact_result *compact_result)\n" + " {\n" + "-\tstruct page *page;\n" + " \tunsigned int noreclaim_flag;\n" + "+\tunsigned long mdflags;\n" + "+\tstruct page *page;\n" + " \n" + " \tif (!order)\n" + " \t\treturn NULL;\n" + " \n" + "+\tmemdelay_enter(&mdflags);\n" + " \tnoreclaim_flag = memalloc_noreclaim_save();\n" + " \t*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,\n" + " \t\t\t\t\t\t\t\t\tprio);\n" + " \tmemalloc_noreclaim_restore(noreclaim_flag);\n" + "+\tmemdelay_leave(&mdflags);\n" + " \n" + " \tif (*compact_result <= COMPACT_INACTIVE)\n" + " \t\treturn NULL;\n" + "@@ -3519,13 +3523,15 @@ 
__perform_reclaim(gfp_t gfp_mask, unsigned int order,\n" + " \t\t\t\t\tconst struct alloc_context *ac)\n" + " {\n" + " \tstruct reclaim_state reclaim_state;\n" + "-\tint progress;\n" + " \tunsigned int noreclaim_flag;\n" + "+\tunsigned long mdflags;\n" + "+\tint progress;\n" + " \n" + " \tcond_resched();\n" + " \n" + " \t/* We now go into synchronous reclaim */\n" + " \tcpuset_memory_pressure_bump();\n" + "+\tmemdelay_enter(&mdflags);\n" + " \tnoreclaim_flag = memalloc_noreclaim_save();\n" + " \tlockdep_set_current_reclaim_state(gfp_mask);\n" + " \treclaim_state.reclaimed_slab = 0;\n" + "@@ -3537,6 +3543,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,\n" + " \tcurrent->reclaim_state = NULL;\n" + " \tlockdep_clear_current_reclaim_state();\n" + " \tmemalloc_noreclaim_restore(noreclaim_flag);\n" + "+\tmemdelay_leave(&mdflags);\n" + " \n" + " \tcond_resched();\n" + " \n" + "diff --git a/mm/vmscan.c b/mm/vmscan.c\n" + "index 60357cd84c67..1029305b9b3a 100644\n" + "--- a/mm/vmscan.c\n" + "+++ b/mm/vmscan.c\n" + "@@ -48,6 +48,7 @@\n" + " #include <linux/prefetch.h>\n" + " #include <linux/printk.h>\n" + " #include <linux/dax.h>\n" + "+#include <linux/memdelay.h>\n" + " \n" + " #include <asm/tlbflush.h>\n" + " #include <asm/div64.h>\n" + "@@ -3098,6 +3099,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,\n" + " {\n" + " \tstruct zonelist *zonelist;\n" + " \tunsigned long nr_reclaimed;\n" + "+\tunsigned long mdflags;\n" + " \tint nid;\n" + " \tunsigned int noreclaim_flag;\n" + " \tstruct scan_control sc = {\n" + "@@ -3126,9 +3128,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,\n" + " \t\t\t\t\t sc.gfp_mask,\n" + " \t\t\t\t\t sc.reclaim_idx);\n" + " \n" + "+\tmemdelay_enter(&mdflags);\n" + " \tnoreclaim_flag = memalloc_noreclaim_save();\n" + " \tnr_reclaimed = do_try_to_free_pages(zonelist, &sc);\n" + " \tmemalloc_noreclaim_restore(noreclaim_flag);\n" + "+\tmemdelay_leave(&mdflags);\n" + " \n" + " 
\ttrace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);\n" + " \n" + "@@ -3550,6 +3554,7 @@ static int kswapd(void *p)\n" + " \tpgdat->kswapd_order = 0;\n" + " \tpgdat->kswapd_classzone_idx = MAX_NR_ZONES;\n" + " \tfor ( ; ; ) {\n" + "+\t\tunsigned long mdflags;\n" + " \t\tbool ret;\n" + " \n" + " \t\talloc_order = reclaim_order = pgdat->kswapd_order;\n" + "@@ -3586,7 +3591,11 @@ static int kswapd(void *p)\n" + " \t\t */\n" + " \t\ttrace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,\n" + " \t\t\t\t\t\talloc_order);\n" + "+\n" + "+\t\tmemdelay_enter(&mdflags);\n" + " \t\treclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);\n" + "+\t\tmemdelay_leave(&mdflags);\n" + "+\n" + " \t\tif (reclaim_order < alloc_order)\n" + " \t\t\tgoto kswapd_try_sleep;\n" + " \t}\n" + "-- \n" + 2.14.1 -b4b1585f187ee8e2be73d0a832d73e143305734a4c5face8904b27985255f21f +b24ffc48b913b11ce4c4452d3550c52327b6c13773cda91c3d49941302a848f0
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.