* [PATCH 1/2] mm: add skipped count and more lru information to trace
From: Minchan Kim @ 2016-07-27  7:29 UTC
  To: Andrew Morton
  Cc: Mel Gorman, Johannes Weiner, linux-mm, linux-kernel, Minchan Kim

The skipped count is important for investigating reclaim overhead
problems, so let's add it. As well, isolation happens on both the active
and inactive lru lists with decreasing priority, so exposing the lru name
and the priority helps identify which lru list is being scanned at which
priority.
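
With this change an isolation event renders along these lines
(illustrative values only; the format follows the TP_printk below):

  mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=12 nr_requested=32 nr_scanned=68 nr_taken=32 nr_skipped=36 lru=ia_file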

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 .../trace/postprocess/trace-vmscan-postprocess.pl  | 10 +++---
 include/trace/events/vmscan.h                      | 39 ++++++++++++++++------
 mm/vmscan.c                                        |  6 ++--
 3 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 8f961ef..461203d 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -112,7 +112,7 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
 my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
 my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
 my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)';
-my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) file=([0-9]*)';
+my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) priority=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) nr_skipped=([0-9]*) lru=([A-Z_]*)';
 my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
 my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
 my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
@@ -207,7 +207,7 @@ $regex_lru_isolate = generate_traceevent_regex(
 			$regex_lru_isolate_default,
 			"isolate_mode", "order",
 			"nr_requested", "nr_scanned", "nr_taken",
-			"file");
+			"nr_skipped", "lru");
 $regex_lru_shrink_inactive = generate_traceevent_regex(
 			"vmscan/mm_vmscan_lru_shrink_inactive",
 			$regex_lru_shrink_inactive_default,
@@ -381,8 +381,8 @@ sub process_events {
 				next;
 			}
 			my $isolate_mode = $1;
-			my $nr_scanned = $4;
-			my $file = $6;
+			my $nr_scanned = $5;
+			my $lru = $8;
 
 			# To closer match vmstat scanning statistics, only count isolate_both
 			# and isolate_inactive as scanning. isolate_active is rotation
@@ -391,7 +391,7 @@ sub process_events {
 			# isolate_both     == 3
 			if ($isolate_mode != 2) {
 				$perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned;
-				if ($file == 1) {
+				if ($lru =~ /file/) {
 					$perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned;
 				} else {
 					$perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned;
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index c88fd09..67f479d 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -273,55 +273,71 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 
 	TP_PROTO(int classzone_idx,
 		int order,
+		int priority,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
+		unsigned long nr_skipped,
 		isolate_mode_t isolate_mode,
-		int file),
+		enum lru_list lru),
 
-	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
+	TP_ARGS(classzone_idx, order, priority, nr_requested, nr_scanned,
+		nr_taken, nr_skipped, isolate_mode, lru),
 
 	TP_STRUCT__entry(
 		__field(int, classzone_idx)
 		__field(int, order)
+		__field(int, priority)
 		__field(unsigned long, nr_requested)
 		__field(unsigned long, nr_scanned)
 		__field(unsigned long, nr_taken)
+		__field(unsigned long, nr_skipped)
 		__field(isolate_mode_t, isolate_mode)
-		__field(int, file)
+		__field(enum lru_list, lru)
 	),
 
 	TP_fast_assign(
 		__entry->classzone_idx = classzone_idx;
 		__entry->order = order;
+		__entry->priority = priority;
 		__entry->nr_requested = nr_requested;
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_taken = nr_taken;
+		__entry->nr_skipped = nr_skipped;
 		__entry->isolate_mode = isolate_mode;
-		__entry->file = file;
+		__entry->lru = lru;
 	),
 
-	TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
+	TP_printk("isolate_mode=%d classzone=%d order=%d priority=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu nr_skipped=%lu lru=%s",
 		__entry->isolate_mode,
 		__entry->classzone_idx,
 		__entry->order,
+		__entry->priority,
 		__entry->nr_requested,
 		__entry->nr_scanned,
 		__entry->nr_taken,
-		__entry->file)
+		__entry->nr_skipped,
+		__print_symbolic(__entry->lru,
+			{ LRU_INACTIVE_ANON, "ia_anon"},
+			{ LRU_ACTIVE_ANON, "ac_anon"},
+			{ LRU_INACTIVE_FILE, "ia_file"},
+			{ LRU_ACTIVE_FILE, "ac_file"}))
 );
 
 DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
 
 	TP_PROTO(int classzone_idx,
 		int order,
+		int priority,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
+		unsigned long nr_skipped,
 		isolate_mode_t isolate_mode,
-		int file),
+		enum lru_list lru),
 
-	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+	TP_ARGS(classzone_idx, order, priority, nr_requested, nr_scanned,
+		nr_taken, nr_skipped, isolate_mode, lru)
 
 );
 
@@ -329,13 +345,16 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
 
 	TP_PROTO(int classzone_idx,
 		int order,
+		int priority,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
+		unsigned long nr_skipped,
 		isolate_mode_t isolate_mode,
-		int file),
+		enum lru_list lru),
 
-	TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+	TP_ARGS(classzone_idx, order, priority, nr_requested, nr_scanned,
+		nr_taken, nr_skipped, isolate_mode, lru)
 
 );
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5af357..f8ded2b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1421,6 +1421,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
 	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
 	unsigned long scan, nr_pages;
+	unsigned long total_skipped = 0;
 	LIST_HEAD(pages_skipped);
 
 	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
@@ -1471,7 +1472,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	 */
 	if (!list_empty(&pages_skipped)) {
 		int zid;
-		unsigned long total_skipped = 0;
 
 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 			if (!nr_skipped[zid])
@@ -1491,8 +1491,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		list_splice(&pages_skipped, src);
 	}
 	*nr_scanned = scan;
-	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
-				    nr_taken, mode, is_file_lru(lru));
+	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, sc->priority,
+			nr_to_scan, scan, nr_taken, total_skipped, mode, lru);
 	update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
 	return nr_taken;
 }
-- 
1.9.1


* [PATCH 2/2] mm: get_scan_count consider reclaimable lru pages
From: Minchan Kim @ 2016-07-27  7:29 UTC
  To: Andrew Morton
  Cc: Mel Gorman, Johannes Weiner, linux-mm, linux-kernel, Minchan Kim

With node-lru, if there are enough reclaimable pages in highmem
but nothing in lowmem, the VM tries to shrink the inactive list even
though the requested zone is lowmem.

The problem is that if the inactive list is full of highmem pages then a
direct reclaimer searching for a lowmem page wastes CPU scanning
uselessly; it just burns CPU.  Worse, many direct reclaimers are stalled
by too_many_isolated if lots of parallel reclaimers are running even
though there is no reclaimable memory left on the inactive list.

With event trace

<...>-4719  [005] ....    57.341146: mm_vmscan_direct_reclaim_begin: order=0 may_writepage=1 gfp_flags=GFP_KERNEL|__GFP_NOWARN|__GFP_REPEAT|__GFP_NORETRY|__GFP_NOTRACK classzone_idx=1
<...>-4719  [005] ....    68.075454: writeback_congestion_wait: usec_timeout=100000 usec_delayed=108000
<...>-4719  [005] ....    68.181068: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
...
...
<...>-4719  [007] ....    71.488011: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
<...>-4719  [007] ....    71.592057: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
<...>-4719  [007] ....    71.696003: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
...
...
<...>-4719  [007] ....    71.696033: mm_vmscan_writepage: page=f5d0b720 pfn=20153 flags=RECLAIM_WB_ANON|RECLAIM_WB_ASYNC
<...>-4719  [007] ....    71.696040: mm_vmscan_writepage: page=f5d0b740 pfn=20154 flags=RECLAIM_WB_ANON|RECLAIM_WB_ASYNC
<...>-4719  [007] ....    71.696042: mm_vmscan_lru_shrink_inactive: nid=0 nr_scanned=5 nr_reclaimed=0 priority=12 flags=RECLAIM_WB_ANON|RECLAIM_WB_ASYNC
<...>-4719  [001] ....    71.799876: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
<...>-4719  [001] ....    71.903758: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
<...>-4719  [001] ....    72.007567: writeback_congestion_wait: usec_timeout=100000 usec_delayed=104000
...
...
<...>-4719  [001] d..1   138.621120: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=10 nr_requested=8 nr_scanned=67051 nr_taken=0 nr_skipped=67051 lru=ia_anon
<...>-4719  [001] d..1   138.621404: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=10 nr_requested=3 nr_scanned=4702 nr_taken=1 nr_skipped=4701 lru=ia_file
<...>-4719  [001] ....   142.357979: mm_vmscan_lru_shrink_inactive: nid=0 nr_scanned=4702 nr_reclaimed=0 priority=10 flags=RECLAIM_WB_FILE|RECLAIM_WB_ASYNC
<...>-4719  [001] d..1   142.361283: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=9 nr_requested=9 nr_scanned=4701 nr_taken=0 nr_skipped=4701 lru=ia_file
<...>-4719  [001] d..1   143.946212: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=8 nr_requested=18 nr_scanned=4701 nr_taken=0 nr_skipped=4701 lru=ia_file
<...>-4719  [001] d..1   143.948097: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=7 nr_requested=32 nr_scanned=4701 nr_taken=0 nr_skipped=4701 lru=ia_file
<...>-4719  [001] d..1   143.951069: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=7 nr_requested=4 nr_scanned=4701 nr_taken=0 nr_skipped=4701 lru=ia_file
<...>-4719  [001] d..1   146.049489: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=6 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   146.050679: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=6 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   146.055146: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=6 nr_requested=11 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   148.475456: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=5 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   148.476377: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=5 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   148.477951: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=5 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   148.480876: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=5 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   151.050139: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=5 nr_requested=23 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   155.143851: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=60307 nr_taken=0 nr_skipped=60307 lru=ia_anon
<...>-4719  [001] d..1   155.148623: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=79109 nr_taken=0 nr_skipped=79109 lru=ac_anon
<...>-4719  [001] d..1   161.197558: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   161.203632: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=57390 nr_taken=0 nr_skipped=57390 lru=ia_anon
<...>-4719  [001] d..1   165.029420: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=73883 nr_taken=0 nr_skipped=73883 lru=ac_anon
<...>-4719  [001] d..1   166.967292: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=4843 nr_taken=0 nr_skipped=4843 lru=ia_file
<...>-4719  [001] d..1   166.971143: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=15 nr_scanned=956 nr_taken=0 nr_skipped=956 lru=ac_file
<...>-4719  [001] d..1   171.223812: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=4 nr_requested=32 nr_scanned=55048 nr_taken=0 nr_skipped=55048 lru=ia_anon
...
...
<...>-4719  [001] d..1   328.487879: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13338 nr_taken=0 nr_skipped=13338 lru=ia_anon
<...>-4719  [001] d..1   328.727392: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13327 nr_taken=0 nr_skipped=13327 lru=ia_anon
<...>-4719  [001] d..1   328.742644: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13327 nr_taken=0 nr_skipped=13327 lru=ia_anon
<...>-4719  [001] d..1   329.108697: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13327 nr_taken=0 nr_skipped=13327 lru=ia_anon
<...>-4719  [001] d..1   329.352961: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13329 nr_taken=0 nr_skipped=13329 lru=ia_anon
<...>-4719  [001] d..1   329.860778: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13090 nr_taken=0 nr_skipped=13090 lru=ia_anon
<...>-4719  [001] d..1   329.864116: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=13090 nr_taken=0 nr_skipped=13090 lru=ia_anon
<...>-4719  [001] d..1   330.166590: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12884 nr_taken=0 nr_skipped=12884 lru=ia_anon
<...>-4719  [001] d..1   330.173238: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12884 nr_taken=0 nr_skipped=12884 lru=ia_anon
<...>-4719  [001] d..1   330.461032: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12645 nr_taken=0 nr_skipped=12645 lru=ia_anon
<...>-4719  [001] d..1   330.462362: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12645 nr_taken=0 nr_skipped=12645 lru=ia_anon
<...>-4719  [001] d..1   330.476324: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12645 nr_taken=0 nr_skipped=12645 lru=ia_anon
<...>-4719  [001] d..1   330.880941: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12421 nr_taken=0 nr_skipped=12421 lru=ia_anon
<...>-4719  [001] d..1   330.885534: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12410 nr_taken=0 nr_skipped=12410 lru=ia_anon
<...>-4719  [001] d..1   331.030369: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12384 nr_taken=0 nr_skipped=12384 lru=ia_anon
<...>-4719  [001] d..1   331.410396: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12384 nr_taken=0 nr_skipped=12384 lru=ia_anon
<...>-4719  [001] d..1   331.412967: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12384 nr_taken=0 nr_skipped=12384 lru=ia_anon
<...>-4719  [001] d..1   331.455933: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12384 nr_taken=0 nr_skipped=12384 lru=ia_anon
<...>-4719  [001] d..1   331.456604: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=32 nr_scanned=12384 nr_taken=0 nr_skipped=12384 lru=ia_anon
<...>-4719  [001] d..1   331.462208: mm_vmscan_lru_isolate: isolate_mode=0 classzone=1 order=0 priority=2 nr_requested=25 nr_scanned=12384 nr_taken=0 nr_skipped=12384 lru=ia_anon
<...>-4719  [001] ....   331.580535: mm_vmscan_direct_reclaim_end: nr_reclaimed=113

To solve the issue, get_scan_count should consider the zone-reclaimable
lru size for zone-constrained allocations rather than the node-lru size,
so that it does not scan the lru list at all when there are no
reclaimable pages in the lowmem area.
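
As a rough worked example drawn from the trace above (numbers
approximate): the inactive file LRU holds about 4,700 pages node-wide
and essentially all of them sit in highmem, so for a lowmem-constrained
request the eligible size is roughly

  4700 (node-wide) - 4700 (in zones above reclaim_idx) = 0 pages

whereas today get_scan_count hands out a scan budget of roughly
4700 >> priority pages and isolate_lru_pages then skips every one of
them.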

Another optimization is to avoid stalling repeatedly in the
too_many_isolated loop when there are no reclaimable pages left.

This patch reduces hackbench elapsed time from 400sec to 50sec.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/mmzone.h |   3 +-
 mm/vmscan.c            | 128 +++++++++++++++++++++++++++++--------------------
 mm/workingset.c        |   3 +-
 3 files changed, 81 insertions(+), 53 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d572b78..87d186f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -805,7 +805,8 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 #endif
 }
 
-extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+					int classzone);
 
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8ded2b..f553fd8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,12 +234,33 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
 		pgdat_reclaimable_pages(pgdat) * 6;
 }
 
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/*
+ * Return size of lru list zones[0..classzone_idx] if memcg is disabled.
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+				int classzone_idx)
 {
+	struct pglist_data *pgdat;
+	unsigned long nr_pages, nr_zone_pages;
+	int zid;
+	struct zone *zone;
+
 	if (!mem_cgroup_disabled())
 		return mem_cgroup_get_lru_size(lruvec, lru);
 
-	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+	pgdat = lruvec_pgdat(lruvec);
+	nr_pages = node_page_state(pgdat, NR_LRU_BASE + lru);
+
+	for (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+		zone = &pgdat->node_zones[zid];
+		if (!populated_zone(zone))
+			continue;
+
+		nr_zone_pages = zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
+		nr_pages -= min(nr_pages, nr_zone_pages);
+	}
+
+	return nr_pages;
 }
 
 /*
@@ -1481,13 +1502,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 			total_skipped += nr_skipped[zid];
 		}
 
-		/*
-		 * Account skipped pages as a partial scan as the pgdat may be
-		 * close to unreclaimable. If the LRU list is empty, account
-		 * skipped pages as a full scan.
-		 */
-		scan += list_empty(src) ? total_skipped : total_skipped >> 2;
-
 		list_splice(&pages_skipped, src);
 	}
 	*nr_scanned = scan;
@@ -1652,6 +1666,30 @@ static int current_may_throttle(void)
 		bdi_write_congested(current->backing_dev_info);
 }
 
+static bool inactive_reclaimable_pages(struct lruvec *lruvec,
+				struct scan_control *sc, enum lru_list lru)
+{
+	int zid;
+	struct zone *zone;
+	int file = is_file_lru(lru);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	if (!global_reclaim(sc))
+		return true;
+
+	for (zid = sc->reclaim_idx; zid >= 0; zid--) {
+		zone = &pgdat->node_zones[zid];
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
+				LRU_FILE * file) >= SWAP_CLUSTER_MAX)
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -1674,12 +1712,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
+	/*
+	 * Although get_scan_count tells us it's worth scanning, there
+	 * may be no reclaimable pages in the list if parallel
+	 * reclaimers have already isolated them.
+	 */
+	if (!inactive_reclaimable_pages(lruvec, sc, lru))
+		return 0;
+
 	while (unlikely(too_many_isolated(pgdat, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		/* We are about to die and free our memory. Return now. */
 		if (fatal_signal_pending(current))
 			return SWAP_CLUSTER_MAX;
+
+		if (!inactive_reclaimable_pages(lruvec, sc, lru))
+			return 0;
 	}
 
 	lru_add_drain();
@@ -1995,34 +2044,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	if (!file && !total_swap_pages)
 		return false;
 
-	inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-	active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
-	/*
-	 * For global reclaim on zone-constrained allocations, it is necessary
-	 * to check if rotations are required for lowmem to be reclaimed. This
-	 * calculates the inactive/active pages available in eligible zones.
-	 */
-	if (global_reclaim(sc)) {
-		struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-		int zid;
-
-		for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
-			struct zone *zone = &pgdat->node_zones[zid];
-			unsigned long inactive_zone, active_zone;
-
-			if (!populated_zone(zone))
-				continue;
-
-			inactive_zone = zone_page_state(zone,
-					NR_ZONE_LRU_BASE + (file * LRU_FILE));
-			active_zone = zone_page_state(zone,
-					NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
-
-			inactive -= min(inactive, inactive_zone);
-			active -= min(active, active_zone);
-		}
-	}
+	inactive = lruvec_lru_size(lruvec, file * LRU_FILE, sc->reclaim_idx);
+	active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE,
+				sc->reclaim_idx);
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -2136,21 +2160,22 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon pages.  Try to detect this based on file LRU size.
 	 */
 	if (global_reclaim(sc)) {
-		unsigned long pgdatfile;
-		unsigned long pgdatfree;
-		int z;
+		unsigned long pgdatfile = 0;
+		unsigned long pgdatfree = 0;
 		unsigned long total_high_wmark = 0;
+		int z;
 
-		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
-			   node_page_state(pgdat, NR_INACTIVE_FILE);
-
-		for (z = 0; z < MAX_NR_ZONES; z++) {
+		for (z = 0; z <= sc->reclaim_idx; z++) {
 			struct zone *zone = &pgdat->node_zones[z];
 			if (!populated_zone(zone))
 				continue;
 
 			total_high_wmark += high_wmark_pages(zone);
+			pgdatfree += zone_page_state(zone, NR_FREE_PAGES);
+			pgdatfile += zone_page_state(zone,
+						NR_ZONE_ACTIVE_FILE);
+			pgdatfile += zone_page_state(zone,
+						NR_ZONE_INACTIVE_FILE);
 		}
 
 		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
@@ -2169,7 +2194,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_list_is_low(lruvec, true, sc) &&
-	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx)
+						>> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2195,10 +2221,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon in [0], file in [1]
 	 */
 
-	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, sc->reclaim_idx) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx);
+	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, sc->reclaim_idx) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx);
 
 	spin_lock_irq(&pgdat->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2236,7 +2262,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			unsigned long size;
 			unsigned long scan;
 
-			size = lruvec_lru_size(lruvec, lru);
+			size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index 69551cf..2342265 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -266,7 +266,8 @@ bool workingset_refault(void *shadow)
 	}
 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE,
+					MAX_NR_ZONES - 1);
 	rcu_read_unlock();
 
 	/*
-- 
1.9.1


* Re: [PATCH 2/2] mm: get_scan_count consider reclaimable lru pages
From: Mel Gorman @ 2016-07-27 14:22 UTC
  To: Minchan Kim; +Cc: Andrew Morton, Johannes Weiner, linux-mm, linux-kernel

On Wed, Jul 27, 2016 at 04:29:48PM +0900, Minchan Kim wrote:
> With node-lru, if there are enough reclaimable pages in highmem
> but nothing in lowmem, VM try to shrink inactive list although
> the requested zone is lowmem.
> 
> The problem is that if the inactive list is full of highmem pages then a
> direct reclaimer searching for a lowmem page waste CPU scanning uselessly.
> It just burns out CPU.  Even, many direct reclaimers are stalled by
> too_many_isolated if lots of parallel reclaimer are going on although
> there are no reclaimable memory in inactive list.
> 

The too_many_isolated point is interesting because the fact that we
congestion_wait() in there is daft. Too many isolated LRU pages have
nothing to do with congestion or dirty pages. More on that later.

> To solve the issue, get_scan_count should consider zone-reclaimable lru
> size in case of constrained-alloc rather than node-lru size so it should
> not scan lru list if there is no reclaimable pages in lowmem area.
> 
> Another optimization is to avoid too many stall in too_many_isolated loop
> if there isn't any reclaimable page any more.
> 

That should be split into a separate patch, particularly if
too_many_isolated is altered to avoid congestion_wait.

> This patch reduces hackbench elapsed time from 400sec to 50sec.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>

Incidentally, this does not apply to mmots (note mmots and not mmotm)
due to other patches that have been picked up in the meantime. It needs
to be rebased.

I had trouble replicating your exact results. I do not know if this is
because we used a different baseline (I had to revert patches and do
some fixups to apply yours) or whether we have different versions of
hackbench. The version I'm using uses 40 processes per group; how many
does yours use?

> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d572b78..87d186f 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -805,7 +805,8 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
>  #endif
>  }
>  
> -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
> +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
> +					int classzone);
>  

Use reclaim_idx as it's sc->reclaim_idx that is passed in. Let's not
reintroduce any confusion between classzone_idx and reclaim_idx.

>  #ifdef CONFIG_HAVE_MEMORY_PRESENT
>  void memory_present(int nid, unsigned long start, unsigned long end);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index f8ded2b..f553fd8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -234,12 +234,33 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
>  		pgdat_reclaimable_pages(pgdat) * 6;
>  }
>  
> -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
> +/*
> + * Return size of lru list zones[0..classzone_idx] if memcg is disabled.
> + */
> +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
> +				int classzone_idx)
>  {
> +	struct pglist_data *pgdat;
> +	unsigned long nr_pages, nr_zone_pages;
> +	int zid;
> +	struct zone *zone;
> +
>  	if (!mem_cgroup_disabled())
>  		return mem_cgroup_get_lru_size(lruvec, lru);
>  
> -	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
> +	pgdat = lruvec_pgdat(lruvec);
> +	nr_pages = node_page_state(pgdat, NR_LRU_BASE + lru);
> +
> +	for (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) {
> +		zone = &pgdat->node_zones[zid];
> +		if (!populated_zone(zone))
> +			continue;
> +
> +		nr_zone_pages = zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
> +		nr_pages -= min(nr_pages, nr_zone_pages);
> +	}
> +
> +	return nr_pages;
>  }
>  
>  /*

Ok.

> @@ -1481,13 +1502,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  			total_skipped += nr_skipped[zid];
>  		}
>  
> -		/*
> -		 * Account skipped pages as a partial scan as the pgdat may be
> -		 * close to unreclaimable. If the LRU list is empty, account
> -		 * skipped pages as a full scan.
> -		 */
> -		scan += list_empty(src) ? total_skipped : total_skipped >> 2;
> -
>  		list_splice(&pages_skipped, src);
>  	}
>  	*nr_scanned = scan;

It's not clear why this is removed. Minimally, there is a race between
when lruvec_lru_size is checked and when the pages are isolated that can
empty the LRU lists in the meantime. Furthermore, if the lists are small
then it still makes sense to account for skipped pages as partial scans
to ensure OOM detection happens. 
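
(As a rough illustration with numbers from the changelog's trace: with
total_skipped=67051 and a non-empty LRU, the removed line credits scan
with 67051 >> 2 = 16762 additional "scanned" pages, so the skipped work
still shows up in the scanned-page accounting that OOM detection depends
on; dropping the line hides that work entirely.)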

> @@ -1652,6 +1666,30 @@ static int current_may_throttle(void)
>  		bdi_write_congested(current->backing_dev_info);
>  }
>  
> +static bool inactive_reclaimable_pages(struct lruvec *lruvec,
> +				struct scan_control *sc, enum lru_list lru)
> +{
> +	int zid;
> +	struct zone *zone;
> +	int file = is_file_lru(lru);
> +	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> +
> +	if (!global_reclaim(sc))
> +		return true;
> +
> 

When you rebase, it should be clear that this check can disappear.

> +	for (zid = sc->reclaim_idx; zid >= 0; zid--) {
> +		zone = &pgdat->node_zones[zid];
> +		if (!populated_zone(zone))
> +			continue;
> +
> +		if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
> +				LRU_FILE * file) >= SWAP_CLUSTER_MAX)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  /*
>   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
>   * of reclaimed pages
> @@ -1674,12 +1712,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
>  	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
>  
> +	/*
> +	 * Although get_scan_count tell us it's worth to scan, there
> +	 * would be no reclaimalble pages in the list if parallel
> +	 * reclaimers already isolated them.
> +	 */
> +	if (!inactive_reclaimable_pages(lruvec, sc, lru))
> +		return 0;
> +
>  	while (unlikely(too_many_isolated(pgdat, file, sc))) {
>  		congestion_wait(BLK_RW_ASYNC, HZ/10);
>  
>  		/* We are about to die and free our memory. Return now. */
>  		if (fatal_signal_pending(current))
>  			return SWAP_CLUSTER_MAX;
> +
> +		if (!inactive_reclaimable_pages(lruvec, sc, lru))
> +			return 0;
>  	}
>  
>  	lru_add_drain();

I think it makes sense to fix this loop first before putting that check
in. I'll post a candidate patch below that arguably should be merged
before this one.

The rest looked ok but I haven't tested it in depth. I'm gathering a
baseline set of results based on mmots at the moment and so should be
ready when/if v2 of this patch arrives.

I'd also like you to consider the following for applying first.

---8<---
From: Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH] mm, vmscan: Wait on a waitqueue when too many pages are
 isolated

When too many pages are isolated, direct reclaim waits on congestion to
clear for up to a tenth of a second. There is no reason to believe that too
many pages are isolated due to dirty pages, reclaim efficiency or congestion.
It may simply be because an extremely large number of processes have entered
direct reclaim at the same time.

This patch has processes wait on a waitqueue when too many pages are
isolated.  When parallel reclaimers finish shrink_page_list, they wake the
waiters to recheck whether too many pages are isolated. While it is difficult
to trigger this corner case, it's possible by launching an extremely large
number of hackbench processes on a 32-bit system with limited memory. Without
the patch, a large number of processes wait uselessly and with the patch
applied, I was unable to stall the system.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 87d186fe60b4..510e074e2f7f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -653,6 +653,7 @@ typedef struct pglist_data {
 	int node_id;
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
+	wait_queue_head_t isolated_wait;
 	struct task_struct *kswapd;	/* Protected by
 					   mem_hotplug_begin/end() */
 	int kswapd_order;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fbd329e61bf6..3800972f240e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5859,6 +5859,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 #endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+	init_waitqueue_head(&pgdat->isolated_wait);
 #ifdef CONFIG_COMPACTION
 	init_waitqueue_head(&pgdat->kcompactd_wait);
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f553fd8597e9..60ba22a8bf1f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1568,16 +1568,16 @@ int isolate_lru_page(struct page *page)
  * the LRU list will go small and be scanned faster than necessary, leading to
  * unnecessary swapping, thrashing and OOM.
  */
-static int too_many_isolated(struct pglist_data *pgdat, int file,
+static bool safe_to_isolate(struct pglist_data *pgdat, int file,
 		struct scan_control *sc)
 {
 	unsigned long inactive, isolated;
 
 	if (current_is_kswapd())
-		return 0;
+		return true;
 
-	if (!sane_reclaim(sc))
-		return 0;
+	if (sane_reclaim(sc))
+		return true;
 
 	if (file) {
 		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
@@ -1595,7 +1595,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
 	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
 		inactive >>= 3;
 
-	return isolated > inactive;
+	return isolated < inactive;
 }
 
 static noinline_for_stack void
@@ -1720,8 +1720,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (!inactive_reclaimable_pages(lruvec, sc, lru))
 		return 0;
 
-	while (unlikely(too_many_isolated(pgdat, file, sc))) {
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+	if (!safe_to_isolate(pgdat, file, sc)) {
+		wait_event_killable(pgdat->isolated_wait,
+			safe_to_isolate(pgdat, file, sc));
 
 		/* We are about to die and free our memory. Return now. */
 		if (fatal_signal_pending(current))
@@ -1763,6 +1764,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 				&nr_writeback, &nr_immediate,
 				false);
 
+	wake_up_all(&pgdat->isolated_wait);
+
 	spin_lock_irq(&pgdat->lru_lock);
 
 	if (global_reclaim(sc)) {


* Re: [PATCH 2/2] mm: get_scan_count consider reclaimable lru pages
From: Mel Gorman @ 2016-07-27 15:13 UTC
  To: Minchan Kim; +Cc: Andrew Morton, Johannes Weiner, linux-mm, linux-kernel

On Wed, Jul 27, 2016 at 03:22:26PM +0100, Mel Gorman wrote:
> ---8<---
> From: Mel Gorman <mgorman@techsingularity.net>
> Subject: [PATCH] mm, vmscan: Wait on a waitqueue when too many pages are
>  isolated
> 

This is potentially a much better version as it avoids wakeup storms and
does a better job of handling the case where pages could not be reclaimed.

---8<---
mm, vmscan: Wait on a waitqueue when too many pages are isolated

When too many pages are isolated, direct reclaim waits on congestion to
clear for up to a tenth of a second. There is no reason to believe that too
many pages are isolated due to dirty pages, reclaim efficiency or congestion.
It may simply be because an extremely large number of processes have entered
direct reclaim at the same time.

This patch has processes wait on a waitqueue when too many pages are
isolated.  When parallel reclaimers finish shrink_page_list, they wake the
waiters to recheck whether too many pages are isolated. While it is difficult
to trigger this corner case, it's possible by launching an extremely large
number of hackbench processes on a 32-bit system with limited memory. Without
the patch, a large number of processes wait uselessly and with the patch
applied, I was unable to stall the system.
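
For reference, wait_event_killable() returns 0 once the condition is true
and -ERESTARTSYS if a fatal signal arrives first, so a minimal sketch of
the waiter side could also consume the return value directly (roughly
equivalent to the explicit fatal_signal_pending() check kept below):

	if (wait_event_killable(pgdat->isolated_wait,
				safe_to_isolate(pgdat, file, sc))) {
		nr_reclaimed = SWAP_CLUSTER_MAX;
		goto out;
	}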

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        |  1 +
 mm/vmscan.c            | 24 +++++++++++++++---------
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d572b78b65e1..467878d7af33 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -653,6 +653,7 @@ typedef struct pglist_data {
 	int node_id;
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
+	wait_queue_head_t isolated_wait;
 	struct task_struct *kswapd;	/* Protected by
 					   mem_hotplug_begin/end() */
 	int kswapd_order;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fbd329e61bf6..3800972f240e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5859,6 +5859,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 #endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+	init_waitqueue_head(&pgdat->isolated_wait);
 #ifdef CONFIG_COMPACTION
 	init_waitqueue_head(&pgdat->kcompactd_wait);
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c0b2b0fc164..e264fcb7556b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1554,16 +1554,16 @@ int isolate_lru_page(struct page *page)
  * the LRU list will go small and be scanned faster than necessary, leading to
  * unnecessary swapping, thrashing and OOM.
  */
-static int too_many_isolated(struct pglist_data *pgdat, int file,
+static bool safe_to_isolate(struct pglist_data *pgdat, int file,
 		struct scan_control *sc)
 {
 	unsigned long inactive, isolated;
 
 	if (current_is_kswapd())
-		return 0;
+		return true;
 
-	if (!sane_reclaim(sc))
-		return 0;
+	if (sane_reclaim(sc))
+		return true;
 
 	if (file) {
 		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
@@ -1581,7 +1581,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
 	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
 		inactive >>= 3;
 
-	return isolated > inactive;
+	return isolated < inactive;
 }
 
 static noinline_for_stack void
@@ -1701,12 +1701,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (!inactive_reclaimable_pages(lruvec, sc, lru))
 		return 0;
 
-	while (unlikely(too_many_isolated(pgdat, file, sc))) {
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+	if (!safe_to_isolate(pgdat, file, sc)) {
+		wait_event_killable(pgdat->isolated_wait,
+			safe_to_isolate(pgdat, file, sc));
 
 		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
+		if (fatal_signal_pending(current)) {
+			nr_reclaimed = SWAP_CLUSTER_MAX;
+			goto out;
+		}
 	}
 
 	lru_add_drain();
@@ -1819,6 +1822,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
 			nr_scanned, nr_reclaimed,
 			sc->priority, file);
+
+out:
+	wake_up(&pgdat->isolated_wait);
 	return nr_reclaimed;
 }
 


* Re: [PATCH 2/2] mm: get_scan_count consider reclaimable lru pages
From: Minchan Kim @ 2016-07-28  1:18 UTC
  To: Mel Gorman
  Cc: Andrew Morton, Johannes Weiner, linux-mm,
	linux-kernel@vger.kernel.org

On Wed, Jul 27, 2016 at 03:22:26PM +0100, Mel Gorman wrote:
> On Wed, Jul 27, 2016 at 04:29:48PM +0900, Minchan Kim wrote:
> > With node-lru, if there are enough reclaimable pages in highmem
> > but nothing in lowmem, VM try to shrink inactive list although
> > the requested zone is lowmem.
> >
> > The problem is that if the inactive list is full of highmem pages then a
> > direct reclaimer searching for a lowmem page waste CPU scanning uselessly.
> > It just burns out CPU.  Even, many direct reclaimers are stalled by
> > too_many_isolated if lots of parallel reclaimer are going on although
> > there are no reclaimable memory in inactive list.
> >
>
> The too_many_isolated point is interesting because the fact we
> congestion_wait in there is daft. Too many isolated LRU pages has nothing
> to do with congestion or dirty pages. More on that later

Agree.

>
> > To solve the issue, get_scan_count should consider zone-reclaimable lru
> > size in case of constrained-alloc rather than node-lru size so it should
> > not scan lru list if there is no reclaimable pages in lowmem area.
> >
> > Another optimization is to avoid too many stall in too_many_isolated loop
> > if there isn't any reclaimable page any more.
> >
>
> That should be split into a separate patch, particularly if
> too_many_isolated is altered to avoid congestion_wait.
>
> > This patch reduces hackbench elapsed time from 400sec to 50sec.
> >
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
>
> Incidentally, this does not apply to mmots (note mmots and not mmotm)
> due to other patches that have been picked up in the meantime. It needs
> to be rebased.

Will check it.

>
> I had trouble replicating your exact results. I do not know if this is
> because we used a different baseline (I had to revert patches and do
> some fixups to apply yours) or whether we have different versions of
> hackbench. The version I'm using uses 40 processes per group, how many
> does yours use?

My baseline was mmotm-2016-07-21-15-11 plus "mm, vmscan: remove
highmem_file_pages -fix".  The command I used is 'hackbench -l 1 -g 500 -P'
but the result is the same with the hackbench invocation mmtests includes
(hackbench 500 process 1).  The default is 40 processes per group, so the
command creates 2000 processes in total.  My i386 guest has 8 processors
and 2G of memory, and the host has 6 CPUs at 3.2GHz.

>
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index d572b78..87d186f 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -805,7 +805,8 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
> >  #endif
> >  }
> >  
> > -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
> > +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
> > + int classzone);
> >  
>
> Use reclaim_idx as it's sc->reclaim_idx that is passed in. Lets not
> reintroduce any confusion between classzone_idx and reclaim_idx.
>
> >  #ifdef CONFIG_HAVE_MEMORY_PRESENT
> >  void memory_present(int nid, unsigned long start, unsigned long end);
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index f8ded2b..f553fd8 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -234,12 +234,33 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
> >   pgdat_reclaimable_pages(pgdat) * 6;
> >  }
> >  
> > -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
> > +/*
> > + * Return size of lru list zones[0..classzone_idx] if memcg is disabled.
> > + */
> > +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
> > + int classzone_idx)
> >  {
> > + struct pglist_data *pgdat;
> > + unsigned long nr_pages, nr_zone_pages;
> > + int zid;
> > + struct zone *zone;
> > +
> >   if (!mem_cgroup_disabled())
> >   return mem_cgroup_get_lru_size(lruvec, lru);
> >  
> > - return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
> > + pgdat = lruvec_pgdat(lruvec);
> > + nr_pages = node_page_state(pgdat, NR_LRU_BASE + lru);
> > +
> > + for (zid = classzone_idx + 1; zid < MAX_NR_ZONES; zid++) {
> > + zone = &pgdat->node_zones[zid];
> > + if (!populated_zone(zone))
> > + continue;
> > +
> > + nr_zone_pages = zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
> > + nr_pages -= min(nr_pages, nr_zone_pages);
> > + }
> > +
> > + return nr_pages;
> >  }
> >  
> >  /*
>
> Ok.
>
> > @@ -1481,13 +1502,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
> >   total_skipped += nr_skipped[zid];
> >   }
> >  
> > - /*
> > - * Account skipped pages as a partial scan as the pgdat may be
> > - * close to unreclaimable. If the LRU list is empty, account
> > - * skipped pages as a full scan.
> > - */
> > - scan += list_empty(src) ? total_skipped : total_skipped >> 2;
> > -
> >   list_splice(&pages_skipped, src);
> >   }
> >   *nr_scanned = scan;
>
> It's not clear why this is removed. Minimally, there is a race between
> when lruvec_lru_size is checked and when the pages are isolated that can
> empty the LRU lists in the meantime. Furthermore, if the lists are small
> then it still makes sense to account for skipped pages as partial scans
> to ensure OOM detection happens.

I will remove that part because it's not related to the problem I am
addressing.

>
> > @@ -1652,6 +1666,30 @@ static int current_may_throttle(void)
> >   bdi_write_congested(current->backing_dev_info);
> >  }
> >  
> > +static bool inactive_reclaimable_pages(struct lruvec *lruvec,
> > + struct scan_control *sc, enum lru_list lru)
> > +{
> > + int zid;
> > + struct zone *zone;
> > + int file = is_file_lru(lru);
> > + struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> > +
> > + if (!global_reclaim(sc))
> > + return true;
> > +
> >
>
> When you rebase, it should be clear that this check can disappear.

Sure..

>
> > + for (zid = sc->reclaim_idx; zid >= 0; zid--) {
> > + zone = &pgdat->node_zones[zid];
> > + if (!populated_zone(zone))
> > + continue;
> > +
> > + if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
> > + LRU_FILE * file) >= SWAP_CLUSTER_MAX)
> > + return true;
> > + }
> > +
> > + return false;
> > +}
> > +
> >  /*
> >   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
> >   * of reclaimed pages
> > @@ -1674,12 +1712,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
> >   struct pglist_data *pgdat = lruvec_pgdat(lruvec);
> >   struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
> >  
> > + /*
> > + * Although get_scan_count tell us it's worth to scan, there
> > + * would be no reclaimalble pages in the list if parallel
> > + * reclaimers already isolated them.
> > + */
> > + if (!inactive_reclaimable_pages(lruvec, sc, lru))
> > + return 0;
> > +
> >   while (unlikely(too_many_isolated(pgdat, file, sc))) {
> >   congestion_wait(BLK_RW_ASYNC, HZ/10);
> >  
> >   /* We are about to die and free our memory. Return now. */
> >   if (fatal_signal_pending(current))
> >   return SWAP_CLUSTER_MAX;
> > +
> > + if (!inactive_reclaimable_pages(lruvec, sc, lru))
> > + return 0;
> >   }
> >  
> >   lru_add_drain();
>
> I think it makes sense to fix this loop first before putting that check
> in. I'll post a candidate patch below that arguably should be merged
> before this one.

I don't mind which patch goes first; it's fine by me if you want yours
to go in first.

>
> The rest looked ok but I haven't tested it in depth. I'm gathering a
> baseline set of results based on mmots at the moment and so should be
> ready when/if v2 of this patch arrives.
>
> I'd also like you to consider the following for applying first.

Will review your recent version.

Thanks.


* Re: [PATCH 2/2] mm: get_scan_count consider reclaimable lru pages
From: Minchan Kim @ 2016-07-28  1:26 UTC
  To: Mel Gorman; +Cc: Andrew Morton, Johannes Weiner, linux-mm, linux-kernel

On Wed, Jul 27, 2016 at 04:13:33PM +0100, Mel Gorman wrote:
> On Wed, Jul 27, 2016 at 03:22:26PM +0100, Mel Gorman wrote:
> > ---8<---
> > From: Mel Gorman <mgorman@techsingularity.net>
> > Subject: [PATCH] mm, vmscan: Wait on a waitqueue when too many pages are
> >  isolated
> > 
> 
> This is potentially a much better version as it avoids wakeup storms and
> do a better job of handling the case where pages could not be reclaimed.
> 
> ---8<---
> mm, vmscan: Wait on a waitqueue when too many pages are isolated
> 
> When too many pages are isolated, direct reclaim waits on congestion to
> clear for up to a tenth of a second. There is no reason to believe that too
> many pages are isolated due to dirty pages, reclaim efficiency or congestion.
> It may simply be because an extremely large number of processes have entered
> direct reclaim at the same time.
> 
> This patch has processes wait on a waitqueue when too many pages are
> isolated.  When parallel reclaimers finish shrink_page_list, they wake the
> waiters to recheck whether too many pages are isolated. While it is difficult
> to trigger this corner case, it's possible by lauching an extremely large
> number of hackbench processes on a 32-bit system with limited memory. Without
> the patch, a large number of processes wait uselessly and with the patch
> applied, I was unable to stall the system.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> ---
>  include/linux/mmzone.h |  1 +
>  mm/page_alloc.c        |  1 +
>  mm/vmscan.c            | 24 +++++++++++++++---------
>  3 files changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d572b78b65e1..467878d7af33 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -653,6 +653,7 @@ typedef struct pglist_data {
>  	int node_id;
>  	wait_queue_head_t kswapd_wait;
>  	wait_queue_head_t pfmemalloc_wait;
> +	wait_queue_head_t isolated_wait;
>  	struct task_struct *kswapd;	/* Protected by
>  					   mem_hotplug_begin/end() */
>  	int kswapd_order;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fbd329e61bf6..3800972f240e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5859,6 +5859,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
>  #endif
>  	init_waitqueue_head(&pgdat->kswapd_wait);
>  	init_waitqueue_head(&pgdat->pfmemalloc_wait);
> +	init_waitqueue_head(&pgdat->isolated_wait);
>  #ifdef CONFIG_COMPACTION
>  	init_waitqueue_head(&pgdat->kcompactd_wait);
>  #endif
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9c0b2b0fc164..e264fcb7556b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1554,16 +1554,16 @@ int isolate_lru_page(struct page *page)
>   * the LRU list will go small and be scanned faster than necessary, leading to
>   * unnecessary swapping, thrashing and OOM.
>   */
> -static int too_many_isolated(struct pglist_data *pgdat, int file,
> +static bool safe_to_isolate(struct pglist_data *pgdat, int file,
>  		struct scan_control *sc)
>  {
>  	unsigned long inactive, isolated;
>  
>  	if (current_is_kswapd())
> -		return 0;
> +		return true;
>  
> -	if (!sane_reclaim(sc))
> -		return 0;
> +	if (sane_reclaim(sc))
> +		return true;
>  
>  	if (file) {
>  		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> @@ -1581,7 +1581,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
>  	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
>  		inactive >>= 3;
>  
> -	return isolated > inactive;
> +	return isolated < inactive;
>  }
>  
>  static noinline_for_stack void
> @@ -1701,12 +1701,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  	if (!inactive_reclaimable_pages(lruvec, sc, lru))
>  		return 0;
>  
> -	while (unlikely(too_many_isolated(pgdat, file, sc))) {
> -		congestion_wait(BLK_RW_ASYNC, HZ/10);
> +	if (!safe_to_isolate(pgdat, file, sc)) {
> +		wait_event_killable(pgdat->isolated_wait,
> +			safe_to_isolate(pgdat, file, sc));
>  
>  		/* We are about to die and free our memory. Return now. */
> -		if (fatal_signal_pending(current))
> -			return SWAP_CLUSTER_MAX;
> +		if (fatal_signal_pending(current)) {
> +			nr_reclaimed = SWAP_CLUSTER_MAX;
> +			goto out;
> +		}
>  	}
>  
>  	lru_add_drain();
> @@ -1819,6 +1822,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>  	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
>  			nr_scanned, nr_reclaimed,
>  			sc->priority, file);
> +
> +out:
> +	wake_up(&pgdat->isolated_wait);

Isolation can happen via migration/khugepaged as well as reclaim, so a
waiter could sleep forever unless all of the places that isolate pages
also issue a wake up.
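
A minimal sketch of the kind of pairing that implies (hypothetical
placement, purely illustrative): every path that puts isolated pages back
and decrements the node's NR_ISOLATED_* counters would need something
like

	/* after the node's isolated counters are decremented */
	wake_up(&pgdat->isolated_wait);

for example in the migration/compaction putback paths, otherwise a
reclaimer could miss its wakeup and sleep in wait_event_killable()
indefinitely.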

