diff for duplicates of <20160726082701.GA9950@bbox> diff --git a/a/1.txt b/N1/1.txt index 2f42850..cb1d166 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -22,3 +22,226 @@ On Tue, Jul 26, 2016 at 08:46:50AM +0100, Mel Gorman wrote: Okay, Then how about this? I didn't test it but I guess it should work. + +>From 8f9f04045724d0d5752d100b94d060183e5a0f7d Mon Sep 17 00:00:00 2001 +From: Minchan Kim <minchan@kernel.org> +Date: Tue, 26 Jul 2016 11:57:27 +0900 +Subject: [PATCH] mm: get_scan_count consider reclaimable lru pages + +With node-lru, if there are enough reclaimable pages in highmem +but nothing in lowmem, VM can try to shrink inactive list although +the requested zone is lowmem. + +The problem is that if the inactive list is full of highmem pages then a +direct reclaimer searching for a lowmem page waste CPU scanning uselessly. +It just burns out CPU. Even, many direct reclaimers are stalled by +too_many_isolated if lots of parallel reclaimer are going on although +there are no reclaimable memory in inactive list. + +To solve the issue, get_scan_count should consider reclaimable +lru size. + +I tried the experiment 4 times in 32bit 2G 8 CPU KVM machine to get +elapsed time. 
+ + hackbench 500 process 2 + += Old = + +1st: 289s 2nd: 310s 3rd: 112s 4th: 272s + += Now = + +1st: 31s 2nd: 132s 3rd: 162s 4th: 50s + +Not-yet-Signed-off-by: Minchan Kim <minchan@kernel.org> +--- + include/linux/mmzone.h | 3 +- + mm/vmscan.c | 91 ++++++++++++++++++++++---------------------------- + mm/workingset.c | 2 +- + 3 files changed, 43 insertions(+), 53 deletions(-) + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index d572b78..87d186f 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -805,7 +805,8 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) + #endif + } + +-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru); ++extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, ++ int classzone); + + #ifdef CONFIG_HAVE_MEMORY_PRESENT + void memory_present(int nid, unsigned long start, unsigned long end); +diff --git a/mm/vmscan.c b/mm/vmscan.c +index e5af357..c27e307 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -234,12 +234,33 @@ bool pgdat_reclaimable(struct pglist_data *pgdat) + pgdat_reclaimable_pages(pgdat) * 6; + } + +-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) ++/* ++ * Return size of lru list zones[0..classzone_idx] if memcg is disabled. 
++ */ ++unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, ++ int classzone_idx) + { ++ struct pglist_data *pgdat; ++ unsigned long nr_pages, nr_zone_pages; ++ int zid; ++ struct zone *zone; ++ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_lru_size(lruvec, lru); + +- return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); ++ pgdat = lruvec_pgdat(lruvec); ++ nr_pages = node_page_state(pgdat, NR_LRU_BASE + lru); ++ ++ for (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) { ++ zone = &pgdat->node_zones[zid]; ++ if (!populated_zone(zone)) ++ continue; ++ ++ nr_zone_pages = zone_page_state(zone, NR_ZONE_LRU_BASE + lru); ++ nr_pages -= min(nr_pages, nr_zone_pages); ++ } ++ ++ return nr_pages; + } + + /* +@@ -1481,13 +1502,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + total_skipped += nr_skipped[zid]; + } + +- /* +- * Account skipped pages as a partial scan as the pgdat may be +- * close to unreclaimable. If the LRU list is empty, account +- * skipped pages as a full scan. +- */ +- scan += list_empty(src) ? total_skipped : total_skipped >> 2; +- + list_splice(&pages_skipped, src); + } + *nr_scanned = scan; +@@ -1995,34 +2009,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, + if (!file && !total_swap_pages) + return false; + +- inactive = lruvec_lru_size(lruvec, file * LRU_FILE); +- active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); +- +- /* +- * For global reclaim on zone-constrained allocations, it is necessary +- * to check if rotations are required for lowmem to be reclaimed. This +- * calculates the inactive/active pages available in eligible zones. 
+- */ +- if (global_reclaim(sc)) { +- struct pglist_data *pgdat = lruvec_pgdat(lruvec); +- int zid; +- +- for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { +- struct zone *zone = &pgdat->node_zones[zid]; +- unsigned long inactive_zone, active_zone; +- +- if (!populated_zone(zone)) +- continue; +- +- inactive_zone = zone_page_state(zone, +- NR_ZONE_LRU_BASE + (file * LRU_FILE)); +- active_zone = zone_page_state(zone, +- NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); +- +- inactive -= min(inactive, inactive_zone); +- active -= min(active, active_zone); +- } +- } ++ inactive = lruvec_lru_size(lruvec, file * LRU_FILE, sc->reclaim_idx); ++ active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE, ++ sc->reclaim_idx); + + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) +@@ -2136,21 +2125,20 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, + * anon pages. Try to detect this based on file LRU size. + */ + if (global_reclaim(sc)) { +- unsigned long pgdatfile; +- unsigned long pgdatfree; +- int z; ++ unsigned long pgdatfile = 0; ++ unsigned long pgdatfree = 0; + unsigned long total_high_wmark = 0; ++ int z; + +- pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); +- pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + +- node_page_state(pgdat, NR_INACTIVE_FILE); +- +- for (z = 0; z < MAX_NR_ZONES; z++) { ++ for (z = 0; z <= sc->reclaim_idx; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!populated_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); ++ pgdatfree += zone_page_state(zone, NR_FREE_PAGES); ++ pgdatfile += zone_page_state(zone, NR_ZONE_ACTIVE_FILE); ++ pgdatfile += zone_page_state(zone, NR_ZONE_INACTIVE_FILE); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { +@@ -2169,7 +2157,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, + * system is under heavy pressure. 
+ */ + if (!inactive_list_is_low(lruvec, true, sc) && +- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { ++ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) ++ >> sc->priority) { + scan_balance = SCAN_FILE; + goto out; + } +@@ -2195,10 +2184,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, + * anon in [0], file in [1] + */ + +- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + +- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); +- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + +- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); ++ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, sc->reclaim_idx) + ++ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx); ++ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, sc->reclaim_idx) + ++ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx); + + spin_lock_irq(&pgdat->lru_lock); + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { +@@ -2236,7 +2225,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, + unsigned long size; + unsigned long scan; + +- size = lruvec_lru_size(lruvec, lru); ++ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + + if (!scan && pass && force_scan) +diff --git a/mm/workingset.c b/mm/workingset.c +index 69551cf..0c71027 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -266,7 +266,7 @@ bool workingset_refault(void *shadow) + } + lruvec = mem_cgroup_lruvec(pgdat, memcg); + refault = atomic_long_read(&lruvec->inactive_age); +- active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); ++ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES - 1); + rcu_read_unlock(); + + /* +-- +1.9.1 diff --git a/a/content_digest b/N1/content_digest index 2267a59..1897511 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -8,8 +8,8 @@ "To\0Mel Gorman <mgorman@suse.de>\0" "Cc\0Andrew Morton <akpm@linux-foundation.org>" Johannes Weiner <hannes@cmpxchg.org> - 
linux-mm@kvack.org - linux-kernel@vger.kernel.org + <linux-mm@kvack.org> + <linux-kernel@vger.kernel.org> Michal Hocko <mhocko@kernel.org> " Vladimir Davydov <vdavydov@virtuozzo.com>\0" "\00:1\0" @@ -37,6 +37,229 @@ "> numbers of containers and require usage of kmemcg.\n" "\n" "Okay, Then how about this?\n" - I didn't test it but I guess it should work. + "I didn't test it but I guess it should work.\n" + "\n" + ">From 8f9f04045724d0d5752d100b94d060183e5a0f7d Mon Sep 17 00:00:00 2001\n" + "From: Minchan Kim <minchan@kernel.org>\n" + "Date: Tue, 26 Jul 2016 11:57:27 +0900\n" + "Subject: [PATCH] mm: get_scan_count consider reclaimable lru pages\n" + "\n" + "With node-lru, if there are enough reclaimable pages in highmem\n" + "but nothing in lowmem, VM can try to shrink inactive list although\n" + "the requested zone is lowmem.\n" + "\n" + "The problem is that if the inactive list is full of highmem pages then a\n" + "direct reclaimer searching for a lowmem page waste CPU scanning uselessly.\n" + "It just burns out CPU. 
Even, many direct reclaimers are stalled by\n" + "too_many_isolated if lots of parallel reclaimer are going on although\n" + "there are no reclaimable memory in inactive list.\n" + "\n" + "To solve the issue, get_scan_count should consider reclaimable\n" + "lru size.\n" + "\n" + "I tried the experiment 4 times in 32bit 2G 8 CPU KVM machine to get\n" + "elapsed time.\n" + "\n" + " hackbench 500 process 2\n" + "\n" + "= Old =\n" + "\n" + "1st: 289s 2nd: 310s 3rd: 112s 4th: 272s\n" + "\n" + "= Now =\n" + "\n" + "1st: 31s 2nd: 132s 3rd: 162s 4th: 50s\n" + "\n" + "Not-yet-Signed-off-by: Minchan Kim <minchan@kernel.org>\n" + "---\n" + " include/linux/mmzone.h | 3 +-\n" + " mm/vmscan.c | 91 ++++++++++++++++++++++----------------------------\n" + " mm/workingset.c | 2 +-\n" + " 3 files changed, 43 insertions(+), 53 deletions(-)\n" + "\n" + "diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h\n" + "index d572b78..87d186f 100644\n" + "--- a/include/linux/mmzone.h\n" + "+++ b/include/linux/mmzone.h\n" + "@@ -805,7 +805,8 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)\n" + " #endif\n" + " }\n" + " \n" + "-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);\n" + "+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,\n" + "+\t\t\t\t\tint classzone);\n" + " \n" + " #ifdef CONFIG_HAVE_MEMORY_PRESENT\n" + " void memory_present(int nid, unsigned long start, unsigned long end);\n" + "diff --git a/mm/vmscan.c b/mm/vmscan.c\n" + "index e5af357..c27e307 100644\n" + "--- a/mm/vmscan.c\n" + "+++ b/mm/vmscan.c\n" + "@@ -234,12 +234,33 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)\n" + " \t\tpgdat_reclaimable_pages(pgdat) * 6;\n" + " }\n" + " \n" + "-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)\n" + "+/*\n" + "+ * Return size of lru list zones[0..classzone_idx] if memcg is disabled.\n" + "+ */\n" + "+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list 
lru,\n" + "+\t\t\t\tint classzone_idx)\n" + " {\n" + "+\tstruct pglist_data *pgdat;\n" + "+\tunsigned long nr_pages, nr_zone_pages;\n" + "+\tint zid;\n" + "+\tstruct zone *zone;\n" + "+\n" + " \tif (!mem_cgroup_disabled())\n" + " \t\treturn mem_cgroup_get_lru_size(lruvec, lru);\n" + " \n" + "-\treturn node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);\n" + "+\tpgdat = lruvec_pgdat(lruvec);\n" + "+\tnr_pages = node_page_state(pgdat, NR_LRU_BASE + lru);\n" + "+\n" + "+\tfor (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) {\n" + "+\t\tzone = &pgdat->node_zones[zid];\n" + "+\t\tif (!populated_zone(zone))\n" + "+\t\t\tcontinue;\n" + "+\n" + "+\t\tnr_zone_pages = zone_page_state(zone, NR_ZONE_LRU_BASE + lru);\n" + "+\t\tnr_pages -= min(nr_pages, nr_zone_pages);\n" + "+\t}\n" + "+\n" + "+\treturn nr_pages;\n" + " }\n" + " \n" + " /*\n" + "@@ -1481,13 +1502,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,\n" + " \t\t\ttotal_skipped += nr_skipped[zid];\n" + " \t\t}\n" + " \n" + "-\t\t/*\n" + "-\t\t * Account skipped pages as a partial scan as the pgdat may be\n" + "-\t\t * close to unreclaimable. If the LRU list is empty, account\n" + "-\t\t * skipped pages as a full scan.\n" + "-\t\t */\n" + "-\t\tscan += list_empty(src) ? total_skipped : total_skipped >> 2;\n" + "-\n" + " \t\tlist_splice(&pages_skipped, src);\n" + " \t}\n" + " \t*nr_scanned = scan;\n" + "@@ -1995,34 +2009,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,\n" + " \tif (!file && !total_swap_pages)\n" + " \t\treturn false;\n" + " \n" + "-\tinactive = lruvec_lru_size(lruvec, file * LRU_FILE);\n" + "-\tactive = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);\n" + "-\n" + "-\t/*\n" + "-\t * For global reclaim on zone-constrained allocations, it is necessary\n" + "-\t * to check if rotations are required for lowmem to be reclaimed. 
This\n" + "-\t * calculates the inactive/active pages available in eligible zones.\n" + "-\t */\n" + "-\tif (global_reclaim(sc)) {\n" + "-\t\tstruct pglist_data *pgdat = lruvec_pgdat(lruvec);\n" + "-\t\tint zid;\n" + "-\n" + "-\t\tfor (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {\n" + "-\t\t\tstruct zone *zone = &pgdat->node_zones[zid];\n" + "-\t\t\tunsigned long inactive_zone, active_zone;\n" + "-\n" + "-\t\t\tif (!populated_zone(zone))\n" + "-\t\t\t\tcontinue;\n" + "-\n" + "-\t\t\tinactive_zone = zone_page_state(zone,\n" + "-\t\t\t\t\tNR_ZONE_LRU_BASE + (file * LRU_FILE));\n" + "-\t\t\tactive_zone = zone_page_state(zone,\n" + "-\t\t\t\t\tNR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);\n" + "-\n" + "-\t\t\tinactive -= min(inactive, inactive_zone);\n" + "-\t\t\tactive -= min(active, active_zone);\n" + "-\t\t}\n" + "-\t}\n" + "+\tinactive = lruvec_lru_size(lruvec, file * LRU_FILE, sc->reclaim_idx);\n" + "+\tactive = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE,\n" + "+\t\t\t\tsc->reclaim_idx);\n" + " \n" + " \tgb = (inactive + active) >> (30 - PAGE_SHIFT);\n" + " \tif (gb)\n" + "@@ -2136,21 +2125,20 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,\n" + " \t * anon pages. 
Try to detect this based on file LRU size.\n" + " \t */\n" + " \tif (global_reclaim(sc)) {\n" + "-\t\tunsigned long pgdatfile;\n" + "-\t\tunsigned long pgdatfree;\n" + "-\t\tint z;\n" + "+\t\tunsigned long pgdatfile = 0;\n" + "+\t\tunsigned long pgdatfree = 0;\n" + " \t\tunsigned long total_high_wmark = 0;\n" + "+\t\tint z;\n" + " \n" + "-\t\tpgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);\n" + "-\t\tpgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +\n" + "-\t\t\t node_page_state(pgdat, NR_INACTIVE_FILE);\n" + "-\n" + "-\t\tfor (z = 0; z < MAX_NR_ZONES; z++) {\n" + "+\t\tfor (z = 0; z <= sc->reclaim_idx; z++) {\n" + " \t\t\tstruct zone *zone = &pgdat->node_zones[z];\n" + " \t\t\tif (!populated_zone(zone))\n" + " \t\t\t\tcontinue;\n" + " \n" + " \t\t\ttotal_high_wmark += high_wmark_pages(zone);\n" + "+\t\t\tpgdatfree += zone_page_state(zone, NR_FREE_PAGES);\n" + "+\t\t\tpgdatfile += zone_page_state(zone, NR_ZONE_ACTIVE_FILE);\n" + "+\t\t\tpgdatfile += zone_page_state(zone, NR_ZONE_INACTIVE_FILE);\n" + " \t\t}\n" + " \n" + " \t\tif (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {\n" + "@@ -2169,7 +2157,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,\n" + " \t * system is under heavy pressure.\n" + " \t */\n" + " \tif (!inactive_list_is_low(lruvec, true, sc) &&\n" + "-\t lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {\n" + "+\t lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx)\n" + "+\t\t\t\t\t\t>> sc->priority) {\n" + " \t\tscan_balance = SCAN_FILE;\n" + " \t\tgoto out;\n" + " \t}\n" + "@@ -2195,10 +2184,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,\n" + " \t * anon in [0], file in [1]\n" + " \t */\n" + " \n" + "-\tanon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +\n" + "-\t\tlruvec_lru_size(lruvec, LRU_INACTIVE_ANON);\n" + "-\tfile = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +\n" + "-\t\tlruvec_lru_size(lruvec, LRU_INACTIVE_FILE);\n" + "+\tanon = 
lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, sc->reclaim_idx) +\n" + "+\t\tlruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx);\n" + "+\tfile = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, sc->reclaim_idx) +\n" + "+\t\tlruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx);\n" + " \n" + " \tspin_lock_irq(&pgdat->lru_lock);\n" + " \tif (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {\n" + "@@ -2236,7 +2225,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,\n" + " \t\t\tunsigned long size;\n" + " \t\t\tunsigned long scan;\n" + " \n" + "-\t\t\tsize = lruvec_lru_size(lruvec, lru);\n" + "+\t\t\tsize = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);\n" + " \t\t\tscan = size >> sc->priority;\n" + " \n" + " \t\t\tif (!scan && pass && force_scan)\n" + "diff --git a/mm/workingset.c b/mm/workingset.c\n" + "index 69551cf..0c71027 100644\n" + "--- a/mm/workingset.c\n" + "+++ b/mm/workingset.c\n" + "@@ -266,7 +266,7 @@ bool workingset_refault(void *shadow)\n" + " \t}\n" + " \tlruvec = mem_cgroup_lruvec(pgdat, memcg);\n" + " \trefault = atomic_long_read(&lruvec->inactive_age);\n" + "-\tactive_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);\n" + "+\tactive_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES - 1);\n" + " \trcu_read_unlock();\n" + " \n" + " \t/*\n" + "-- \n" + 1.9.1 -6481b1c9051c4c73819a6a2dc3c463636efc10eb160aa03acc6a49ec2501e461 +d1ecc6b10ca6fa73ec63887e8d1feb1a6a3e03ab7174ed00de79170c28935d86
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.