linux-mm.kvack.org archive mirror
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
To: akpm@linux-foundation.org, mgorman@suse.de, dave@sr71.net,
	hannes@cmpxchg.org, tony.luck@intel.com,
	matthew.garrett@nebula.com, riel@redhat.com,
	arjan@linux.intel.com, srinivas.pandruvada@linux.intel.com,
	willy@linux.intel.com, kamezawa.hiroyu@jp.fujitsu.com,
	lenb@kernel.org, rjw@sisk.pl
Cc: gargankita@gmail.com, paulmck@linux.vnet.ibm.com,
	svaidy@linux.vnet.ibm.com, andi@firstfloor.org,
	isimatu.yasuaki@jp.fujitsu.com, santosh.shilimkar@ti.com,
	kosaki.motohiro@gmail.com, srivatsa.bhat@linux.vnet.ibm.com,
	linux-pm@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: [RFC PATCH v4 37/40] mm: Add a kthread to perform targeted compaction for memory power management
Date: Thu, 26 Sep 2013 04:52:10 +0530	[thread overview]
Message-ID: <20130925232208.26184.58122.stgit@srivatsabhat.in.ibm.com> (raw)
In-Reply-To: <20130925231250.26184.31438.stgit@srivatsabhat.in.ibm.com>

To further increase the opportunities for memory power savings, we can perform
targeted compaction to evacuate lightly filled memory regions. For this
purpose, introduce a dedicated per-node kthread to carry out this targeted
compaction work.

Our "kmempowerd" kthread uses the generic kthread-worker framework to do most
of the usual work all kthreads need to do. On top of that, this kthread has the
following infrastructure in place, to perform the region evacuation.

A work item is instantiated for every zone. Associated with each work item is a
spinlock-protected bitmask that indicates which regions are to be evacuated:
each set bit corresponds to the number of a memory region within that zone
which would benefit from evacuation.
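
For instance, a caller would flag a region for evacuation roughly as follows
(illustrative sketch only; the actual interface for queueing work to the
kthread is introduced by a later patch in this series, and the helper name
used here is made up for the example):

    /*
     * Hypothetical helper, for illustration: mark region 'region_id' of
     * 'zone' for evacuation and notify the per-node kmempowerd kthread.
     */
    static void mark_region_for_evacuation(struct zone *zone, int region_id)
    {
            struct mempower_work *mpwork = &zone->mempower_work;
            unsigned long flags;

            spin_lock_irqsave(&mpwork->lock, flags);
            set_bit(region_id, mpwork->mempower_mask);
            spin_unlock_irqrestore(&mpwork->lock, flags);

            /* kmempowerd() will pick up the newly set bit */
            queue_kthread_work(&zone->zone_pgdat->mempower_worker,
                               &mpwork->work);
    }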

The operation of the "kmempowerd" kthread is quite straightforward: it makes a
local copy of the bitmask (which represents the work it is supposed to do) and
performs targeted evacuation of each region represented in that copy. When it
is done, it clears those bits in the original bitmask to indicate that the
requested work has been completed. While the kthread is busy doing this, the
original bitmask can be updated to indicate the arrival of more work. So once
the kthread finishes one round of processing, it re-examines the original
bitmask to see whether new work arrived in the meantime, and processes it if
so. This continues until the original bitmask becomes empty (no bits set, hence
no more work to do).
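
Distilled from the kmempowerd() implementation in the patch below, the
processing loop looks roughly like this (simplified sketch; the spinlock is
dropped while the regions are actually being evacuated):

    spin_lock_irqsave(&mpwork->lock, flags);
    do {
            /* Snapshot the pending work, then process it without the lock */
            bitmap_copy(local_mask, mpwork->mempower_mask, MAX_NR_ZONE_REGIONS);
            spin_unlock_irqrestore(&mpwork->lock, flags);

            for_each_set_bit(region_id, local_mask, MAX_NR_ZONE_REGIONS)
                    evacuate_mem_region(zone, &zone->zone_regions[region_id]);

            /* Clear only the bits we processed; new requests may have arrived */
            spin_lock_irqsave(&mpwork->lock, flags);
            bitmap_andnot(mpwork->mempower_mask, mpwork->mempower_mask,
                          local_mask, MAX_NR_ZONE_REGIONS);
    } while (!bitmap_empty(mpwork->mempower_mask, MAX_NR_ZONE_REGIONS));
    spin_unlock_irqrestore(&mpwork->lock, flags);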

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/mmzone.h |   10 ++++++
 mm/compaction.c        |   80 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 49c8926..257afdf 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -10,6 +10,7 @@
 #include <linux/bitops.h>
 #include <linux/cache.h>
 #include <linux/threads.h>
+#include <linux/kthread-work.h>
 #include <linux/numa.h>
 #include <linux/init.h>
 #include <linux/seqlock.h>
@@ -128,6 +129,13 @@ struct region_allocator {
 	DECLARE_BITMAP(ralloc_mask, MAX_NR_ZONE_REGIONS);
 };
 
+struct mempower_work {
+	spinlock_t		lock;
+	DECLARE_BITMAP(mempower_mask, MAX_NR_ZONE_REGIONS);
+
+	struct kthread_work	work;
+};
+
 struct pglist_data;
 
 /*
@@ -460,6 +468,7 @@ struct zone {
 	 */
 	unsigned int inactive_ratio;
 
+	struct mempower_work	mempower_work;
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
@@ -830,6 +839,7 @@ typedef struct pglist_data {
 	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
+	struct kthread_worker mempower_worker;
 #ifdef CONFIG_NUMA_BALANCING
 	/*
 	 * Lock serializing the per destination node AutoNUMA memory
diff --git a/mm/compaction.c b/mm/compaction.c
index 9449b7f..0511eae 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
 #include <linux/sysfs.h>
 #include <linux/balloon_compaction.h>
 #include <linux/page-isolation.h>
+#include <linux/kthread.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -1267,6 +1268,85 @@ int evacuate_mem_region(struct zone *z, struct zone_mem_region *zmr)
 	return compact_range(&cc, &ac, &fc, start_pfn, end_pfn);
 }
 
+#define nr_zone_region_bits	MAX_NR_ZONE_REGIONS
+static DECLARE_BITMAP(mpwork_mask, nr_zone_region_bits);
+
+static void kmempowerd(struct kthread_work *work)
+{
+	struct mempower_work *mpwork;
+	struct zone *zone;
+	unsigned long flags;
+	int region_id;
+
+	mpwork = container_of(work, struct mempower_work, work);
+	zone = container_of(mpwork, struct zone, mempower_work);
+
+	spin_lock_irqsave(&mpwork->lock, flags);
+repeat:
+	bitmap_copy(mpwork_mask, mpwork->mempower_mask, nr_zone_region_bits);
+	spin_unlock_irqrestore(&mpwork->lock, flags);
+
+	if (bitmap_empty(mpwork_mask, nr_zone_region_bits))
+		return;
+
+	for_each_set_bit(region_id, mpwork_mask, nr_zone_region_bits)
+		evacuate_mem_region(zone, &zone->zone_regions[region_id]);
+
+	spin_lock_irqsave(&mpwork->lock, flags);
+
+	bitmap_andnot(mpwork->mempower_mask, mpwork->mempower_mask, mpwork_mask,
+		      nr_zone_region_bits);
+	if (!bitmap_empty(mpwork->mempower_mask, nr_zone_region_bits))
+		goto repeat; /* More work got added in the meanwhile */
+
+	spin_unlock_irqrestore(&mpwork->lock, flags);
+
+}
+
+static void kmempowerd_run(int nid)
+{
+	struct kthread_worker *worker;
+	struct mempower_work *mpwork;
+	struct pglist_data *pgdat;
+	struct task_struct *task;
+	unsigned long flags;
+	int i;
+
+	pgdat = NODE_DATA(nid);
+	worker = &pgdat->mempower_worker;
+
+	init_kthread_worker(worker);
+
+	task = kthread_create_on_node(kthread_worker_fn, worker, nid,
+				      "kmempowerd/%d", nid);
+	if (IS_ERR(task))
+		return;
+
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		mpwork = &pgdat->node_zones[i].mempower_work;
+		init_kthread_work(&mpwork->work, kmempowerd);
+
+		spin_lock_init(&mpwork->lock);
+
+		/* Initialize bitmap to zero to indicate no-pending-work */
+		spin_lock_irqsave(&mpwork->lock, flags);
+		bitmap_zero(mpwork->mempower_mask, nr_zone_region_bits);
+		spin_unlock_irqrestore(&mpwork->lock, flags);
+	}
+
+	wake_up_process(task);
+}
+
+int kmempowerd_init(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		kmempowerd_run(nid);
+
+	return 0;
+}
+module_init(kmempowerd_init);
 
 /* Compact all zones within a node */
 static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .

Thread overview: 77+ messages
2013-09-25 23:13 [RFC PATCH v4 00/40] mm: Memory Power Management Srivatsa S. Bhat
2013-09-25 23:13 ` [RFC PATCH v4 01/40] mm: Introduce memory regions data-structure to capture region boundaries within nodes Srivatsa S. Bhat
2013-10-23  9:54   ` Johannes Weiner
2013-10-23 14:38     ` Srivatsa S. Bhat
2013-09-25 23:14 ` [RFC PATCH v4 02/40] mm: Initialize node memory regions during boot Srivatsa S. Bhat
2013-09-25 23:14 ` [RFC PATCH v4 03/40] mm: Introduce and initialize zone memory regions Srivatsa S. Bhat
2013-09-25 23:14 ` [RFC PATCH v4 04/40] mm: Add helpers to retrieve node region and zone region for a given page Srivatsa S. Bhat
2013-09-25 23:14 ` [RFC PATCH v4 05/40] mm: Add data-structures to describe memory regions within the zones' freelists Srivatsa S. Bhat
2013-09-25 23:14 ` [RFC PATCH v4 06/40] mm: Demarcate and maintain pageblocks in region-order in " Srivatsa S. Bhat
2013-09-26 22:16   ` Dave Hansen
2013-09-27  6:34     ` Srivatsa S. Bhat
2013-10-23 10:17   ` Johannes Weiner
2013-10-23 16:09     ` Srivatsa S. Bhat
2013-09-25 23:15 ` [RFC PATCH v4 07/40] mm: Track the freepage migratetype of pages accurately Srivatsa S. Bhat
2013-09-25 23:15 ` [RFC PATCH v4 08/40] mm: Use the correct migratetype during buddy merging Srivatsa S. Bhat
2013-09-25 23:15 ` [RFC PATCH v4 09/40] mm: Add an optimized version of del_from_freelist to keep page allocation fast Srivatsa S. Bhat
2013-09-25 23:15 ` [RFC PATCH v4 10/40] bitops: Document the difference in indexing between fls() and __fls() Srivatsa S. Bhat
2013-09-25 23:16 ` [RFC PATCH v4 11/40] mm: A new optimized O(log n) sorting algo to speed up buddy-sorting Srivatsa S. Bhat
2013-09-25 23:16 ` [RFC PATCH v4 12/40] mm: Add support to accurately track per-memory-region allocation Srivatsa S. Bhat
2013-09-25 23:16 ` [RFC PATCH v4 13/40] mm: Print memory region statistics to understand the buddy allocator behavior Srivatsa S. Bhat
2013-09-25 23:17 ` [RFC PATCH v4 14/40] mm: Enable per-memory-region fragmentation stats in pagetypeinfo Srivatsa S. Bhat
2013-09-25 23:17 ` [RFC PATCH v4 15/40] mm: Add aggressive bias to prefer lower regions during page allocation Srivatsa S. Bhat
2013-09-25 23:17 ` [RFC PATCH v4 16/40] mm: Introduce a "Region Allocator" to manage entire memory regions Srivatsa S. Bhat
2013-10-23 10:10   ` Johannes Weiner
2013-10-23 16:22     ` Srivatsa S. Bhat
2013-09-25 23:17 ` [RFC PATCH v4 17/40] mm: Add a mechanism to add pages to buddy freelists in bulk Srivatsa S. Bhat
2013-09-25 23:18 ` [RFC PATCH v4 18/40] mm: Provide a mechanism to delete pages from " Srivatsa S. Bhat
2013-09-25 23:18 ` [RFC PATCH v4 19/40] mm: Provide a mechanism to release free memory to the region allocator Srivatsa S. Bhat
2013-09-25 23:18 ` [RFC PATCH v4 20/40] mm: Provide a mechanism to request free memory from " Srivatsa S. Bhat
2013-09-25 23:18 ` [RFC PATCH v4 21/40] mm: Maintain the counter for freepages in " Srivatsa S. Bhat
2013-09-25 23:18 ` [RFC PATCH v4 22/40] mm: Propagate the sorted-buddy bias for picking free regions, to " Srivatsa S. Bhat
2013-09-25 23:19 ` [RFC PATCH v4 23/40] mm: Fix vmstat to also account for freepages in the " Srivatsa S. Bhat
2013-09-25 23:19 ` [RFC PATCH v4 24/40] mm: Drop some very expensive sorted-buddy related checks under DEBUG_PAGEALLOC Srivatsa S. Bhat
2013-09-25 23:19 ` [RFC PATCH v4 25/40] mm: Connect Page Allocator(PA) to Region Allocator(RA); add PA => RA flow Srivatsa S. Bhat
2013-09-25 23:19 ` [RFC PATCH v4 26/40] mm: Connect Page Allocator(PA) to Region Allocator(RA); add PA <= " Srivatsa S. Bhat
2013-09-25 23:19 ` [RFC PATCH v4 27/40] mm: Update the freepage migratetype of pages during region allocation Srivatsa S. Bhat
2013-09-25 23:20 ` [RFC PATCH v4 28/40] mm: Provide a mechanism to check if a given page is in the region allocator Srivatsa S. Bhat
2013-09-25 23:20 ` [RFC PATCH v4 29/40] mm: Add a way to request pages of a particular region from " Srivatsa S. Bhat
2013-09-25 23:20 ` [RFC PATCH v4 30/40] mm: Modify move_freepages() to handle pages in the region allocator properly Srivatsa S. Bhat
2013-09-25 23:20 ` [RFC PATCH v4 31/40] mm: Never change migratetypes of pageblocks during freepage stealing Srivatsa S. Bhat
2013-09-25 23:20 ` [RFC PATCH v4 32/40] mm: Set pageblock migratetype when allocating regions from region allocator Srivatsa S. Bhat
2013-09-25 23:21 ` [RFC PATCH v4 33/40] mm: Use a cache between page-allocator and region-allocator Srivatsa S. Bhat
2013-09-25 23:21 ` [RFC PATCH v4 34/40] mm: Restructure the compaction part of CMA for wider use Srivatsa S. Bhat
2013-09-25 23:21 ` [RFC PATCH v4 35/40] mm: Add infrastructure to evacuate memory regions using compaction Srivatsa S. Bhat
2013-09-25 23:21 ` [RFC PATCH v4 36/40] kthread: Split out kthread-worker bits to avoid circular header-file dependency Srivatsa S. Bhat
2013-09-25 23:22 ` Srivatsa S. Bhat [this message]
2013-09-25 23:22 ` [RFC PATCH v4 38/40] mm: Add a mechanism to queue work to the kmempowerd kthread Srivatsa S. Bhat
2013-09-25 23:22 ` [RFC PATCH v4 39/40] mm: Add intelligence in kmempowerd to ignore regions unsuitable for evacuation Srivatsa S. Bhat
2013-09-25 23:22 ` [RFC PATCH v4 40/40] mm: Add triggers in the page-allocator to kick off region evacuation Srivatsa S. Bhat
2013-09-25 23:26 ` [Results] [RFC PATCH v4 00/40] mm: Memory Power Management Srivatsa S. Bhat
2013-09-25 23:40   ` Andrew Morton
2013-09-25 23:47     ` Andi Kleen
2013-09-26  1:14       ` Arjan van de Ven
2013-09-26 13:09         ` Srivatsa S. Bhat
2013-09-26  1:15       ` Arjan van de Ven
2013-09-26  1:21         ` Andrew Morton
2013-09-26  1:50           ` Andi Kleen
2013-09-26  2:59             ` Andrew Morton
2013-09-26 13:42               ` Srivatsa S. Bhat
2013-09-26 15:58                 ` Arjan van de Ven
2013-09-26 17:00                   ` Srivatsa S. Bhat
2013-09-26 18:06                     ` Arjan van de Ven
2013-09-26 18:33                       ` Srivatsa S. Bhat
2013-09-26 18:50                         ` Luck, Tony
2013-09-26 18:56                           ` Srivatsa S. Bhat
2013-09-26 13:37             ` Srivatsa S. Bhat
2013-09-26 15:23           ` Arjan van de Ven
2013-09-26 13:16         ` Srivatsa S. Bhat
2013-09-26 12:58     ` Srivatsa S. Bhat
2013-09-26 15:29       ` Arjan van de Ven
2013-09-26 17:22       ` Luck, Tony
2013-09-26 17:54         ` Srivatsa S. Bhat
2013-09-26 19:38         ` Andi Kleen
2013-11-12  8:02       ` Srivatsa S. Bhat
2013-11-12 17:34         ` Dave Hansen
2013-11-12 18:44           ` Srivatsa S. Bhat
2013-11-12 18:49         ` Srivatsa S. Bhat
