linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Mel Gorman <mgorman@suse.de>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Linux-MM <linux-mm@kvack.org>,
	Linux-Netdev <netdev@vger.kernel.org>,
	LKML <linux-kernel@vger.kernel.org>,
	David Miller <davem@davemloft.net>, Neil Brown <neilb@suse.de>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Mike Christie <michaelc@cs.wisc.edu>,
	Eric B Munson <emunson@mgebm.net>, Mel Gorman <mgorman@suse.de>
Subject: [PATCH 16/17] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
Date: Thu, 10 May 2012 14:45:09 +0100	[thread overview]
Message-ID: <1336657510-24378-17-git-send-email-mgorman@suse.de> (raw)
In-Reply-To: <1336657510-24378-1-git-send-email-mgorman@suse.de>

If swap is backed by network storage such as NBD, there is a risk
that a large number of reclaimers can hang the system by consuming
all PF_MEMALLOC reserves. To avoid these hangs, the administrator
must tune min_free_kbytes in advance which is a bit fragile.

This patch throttles direct reclaimers if half the PF_MEMALLOC reserves
are in use. If the system is routinely getting throttled the system
administrator can increase min_free_kbytes so degradation is smoother
but the system will keep running.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/mmzone.h |    1 +
 mm/page_alloc.c        |    1 +
 mm/vmscan.c            |  128 +++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index dff7115..e6b733d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -663,6 +663,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e225a7c..b9eb64a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4326,6 +4326,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 	
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33dc256..97c766f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2431,6 +2431,80 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+						(enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could forcing other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		return;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		return;
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+			pfmemalloc_watermark_ok(pgdat), HZ);
+		return;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2449,6 +2523,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2598,8 +2681,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
 	int i;
@@ -2608,7 +2696,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
-		return true;
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, a large process exits therby
+	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
+	 * so wake them now if necessary. If necessary, processes will wake
+	 * kswapd and get throttled again
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
@@ -2641,9 +2743,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	 * must be balanced
 	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return pgdat_balanced(pgdat, balanced, classzone_idx);
 	else
-		return !all_zones_ok;
+		return all_zones_ok;
 }
 
 /*
@@ -2871,6 +2973,16 @@ loop_again:
 			}
 
 		}
+
+		/*
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should not be
+		 * able to safely make forward progress. Wake them
+		 */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
@@ -2971,7 +3083,7 @@ out:
 	}
 
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2991,7 +3103,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3001,7 +3113,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
-- 
1.7.9.2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2012-05-10 13:45 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-10 13:44 [PATCH 00/17] Swap-over-NBD without deadlocking V10 Mel Gorman
2012-05-10 13:44 ` [PATCH 01/17] mm: Serialize access to min_free_kbytes Mel Gorman
2012-05-10 13:44 ` [PATCH 02/17] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages Mel Gorman
2012-05-10 13:44 ` [PATCH 03/17] mm: slub: Optimise the SLUB fast path to avoid pfmemalloc checks Mel Gorman
2012-05-10 13:44 ` [PATCH 04/17] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves Mel Gorman
2012-05-10 13:44 ` [PATCH 05/17] mm: allow PF_MEMALLOC from softirq context Mel Gorman
2012-05-11  4:39   ` David Miller
2012-05-14 10:02     ` Mel Gorman
2012-05-15 13:07       ` Mel Gorman
2012-05-10 13:44 ` [PATCH 06/17] mm: Only set page->pfmemalloc when ALLOC_NO_WATERMARKS was used Mel Gorman
2012-05-10 13:45 ` [PATCH 07/17] mm: Ignore mempolicies when using ALLOC_NO_WATERMARK Mel Gorman
2012-05-10 13:45 ` [PATCH 08/17] net: Introduce sk_allocation() to allow addition of GFP flags depending on the individual socket Mel Gorman
2012-05-11  4:49   ` David Miller
2012-05-11 14:12     ` Mel Gorman
2012-05-10 13:45 ` [PATCH 09/17] netvm: Allow the use of __GFP_MEMALLOC by specific sockets Mel Gorman
2012-05-11  4:50   ` David Miller
2012-05-10 13:45 ` [PATCH 10/17] netvm: Allow skb allocation to use PFMEMALLOC reserves Mel Gorman
2012-05-11  4:57   ` David Miller
2012-05-11 14:32     ` Mel Gorman
2012-05-11 14:42       ` Peter Zijlstra
2012-05-11 15:07         ` Mel Gorman
2012-05-11 21:17       ` David Miller
2012-05-10 13:45 ` [PATCH 11/17] netvm: Propagate page->pfmemalloc to skb Mel Gorman
2012-05-10 13:45 ` [PATCH 12/17] netvm: Propagate page->pfmemalloc from netdev_alloc_page " Mel Gorman
2012-05-11  5:01   ` David Miller
2012-05-11 14:46     ` Mel Gorman
2012-05-10 13:45 ` [PATCH 13/17] netvm: Set PF_MEMALLOC as appropriate during SKB processing Mel Gorman
2012-05-11  5:03   ` David Miller
2012-05-10 13:45 ` [PATCH 14/17] mm: Micro-optimise slab to avoid a function call Mel Gorman
2012-05-10 13:45 ` [PATCH 15/17] nbd: Set SOCK_MEMALLOC for access to PFMEMALLOC reserves Mel Gorman
2012-05-10 13:45 ` Mel Gorman [this message]
2012-05-10 13:45 ` [PATCH 17/17] mm: Account for the number of times direct reclaimers get throttled Mel Gorman
2012-05-10 17:17 ` [PATCH 00/17] Swap-over-NBD without deadlocking V10 Mike Christie
2012-05-11  5:04 ` David Miller
2012-05-11 15:45   ` Mel Gorman
2012-05-11 21:23     ` David Miller
2012-05-11 21:29       ` Andrew Morton
2012-05-14 11:10         ` Mel Gorman
  -- strict thread matches above, loose matches on Subject: below --
2012-05-17 14:50 [PATCH 00/17] Swap-over-NBD without deadlocking V11 Mel Gorman
2012-05-17 14:50 ` [PATCH 16/17] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage Mel Gorman
2012-06-20 11:43 [PATCH 00/17] Swap-over-NBD without deadlocking V12 (resend) Mel Gorman
2012-06-20 11:44 ` [PATCH 16/17] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage Mel Gorman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1336657510-24378-17-git-send-email-mgorman@suse.de \
    --to=mgorman@suse.de \
    --cc=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=davem@davemloft.net \
    --cc=emunson@mgebm.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=michaelc@cs.wisc.edu \
    --cc=neilb@suse.de \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).