From: "Huang, Ying" <ying.huang@intel.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
Huang Ying <ying.huang@intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
Michal Hocko <mhocko@suse.com>, Rik van Riel <riel@redhat.com>,
Mel Gorman <mgorman@suse.de>, Ingo Molnar <mingo@kernel.org>,
Dave Hansen <dave.hansen@linux.intel.com>,
Dan Williams <dan.j.williams@intel.com>,
Fengguang Wu <fengguang.wu@intel.com>
Subject: [RFC 04/10] autonuma, memory tiering: Rate limit NUMA migration throughput
Date: Fri, 1 Nov 2019 15:57:21 +0800 [thread overview]
Message-ID: <20191101075727.26683-5-ying.huang@intel.com> (raw)
In-Reply-To: <20191101075727.26683-1-ying.huang@intel.com>
From: Huang Ying <ying.huang@intel.com>
In autonuma memory tiering mode, the hot PMEM (persistent memory)
pages could be migrated to DRAM via autonuma. But this incurs some
overhead too. So that sometimes the workload performance may be hurt.
To avoid too much disturbing to the workload, the migration throughput
should be rate-limited.
At the other hand, in some situation, for example, some workloads
exits, many DRAM pages become free, so that some pages of the other
workloads can be migrated to DRAM. To respond to the workloads
changing quickly, it's better to migrate pages faster.
To address the above 2 requirements, a rate limit algorithm as follows
is used,
- If there is enough free memory in DRAM node (that is, > high
watermark + 2 * rate limit pages), then NUMA migration throughput will
not be rate-limited to respond to the workload changing quickly.
- Otherwise, counting the number of pages to try to migrate to a DRAM
node via autonuma, if the count exceeds the limit specified by the
users, stop NUMA migration until the next second.
A new sysctl knob kernel.numa_balancing_rate_limit_mbps is added for
the users to specify the limit. If its value is 0, the default
value (high watermark) will be used.
TODO: Add ABI document for new sysctl knob.
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
---
include/linux/mmzone.h | 7 ++++
include/linux/sched/sysctl.h | 1 +
kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++
kernel/sysctl.c | 8 +++++
mm/vmstat.c | 3 ++
5 files changed, 82 insertions(+)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d0b6ececaf39..46382b058546 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -247,6 +247,9 @@ enum node_stat_item {
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
+#ifdef CONFIG_NUMA_BALANCING
+ NUMA_TRY_MIGRATE, /* pages to try to migrate via NUMA balancing */
+#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -766,6 +769,10 @@ typedef struct pglist_data {
unsigned long split_queue_len;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned long autonuma_jiffies;
+ unsigned long autonuma_try_migrate;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
struct lruvec lruvec;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 5cfe38783c60..e3616889a91c 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -42,6 +42,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_rate_limit;
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..489e2e21bb5d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1047,6 +1047,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/*
+ * Restrict the NUMA migration per second in MB for each target node
+ * if no enough free space in target node
+ */
+unsigned int sysctl_numa_balancing_rate_limit;
+
struct numa_group {
refcount_t refcount;
@@ -1397,6 +1403,44 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long rate_limit;
+
+ rate_limit = sysctl_numa_balancing_rate_limit << (20 - PAGE_SHIFT);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) + rate_limit * 2,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+static bool numa_migration_check_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long try_migrate;
+ unsigned long now = jiffies, last_jiffies;
+
+ mod_node_page_state(pgdat, NUMA_TRY_MIGRATE, nr);
+ try_migrate = node_page_state(pgdat, NUMA_TRY_MIGRATE);
+ last_jiffies = pgdat->autonuma_jiffies;
+ if (now > last_jiffies + HZ &&
+ cmpxchg(&pgdat->autonuma_jiffies, last_jiffies, now) ==
+ last_jiffies)
+ pgdat->autonuma_try_migrate = try_migrate;
+ if (try_migrate - pgdat->autonuma_try_migrate > rate_limit)
+ return false;
+ return true;
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1404,6 +1448,25 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * If memory tiering mode is enabled, will try promote pages
+ * in slow memory node to fast memory node.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ next_promotion_node(src_nid) != -1) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat))
+ return true;
+
+ rate_limit = sysctl_numa_balancing_rate_limit <<
+ (20 - PAGE_SHIFT);
+ return numa_migration_check_rate_limit(pgdat, rate_limit,
+ hpage_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 442acedb1fe7..c455ff404436 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -421,6 +421,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
+ {
+ .procname = "numa_balancing_rate_limit_mbps",
+ .data = &sysctl_numa_balancing_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
{
.procname = "numa_balancing",
.data = &sysctl_numa_balancing_mode,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d4bbf9d69c06..c7ec14673ba8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1198,6 +1198,9 @@ const char * const vmstat_text[] = {
"nr_dirtied",
"nr_written",
"nr_kernel_misc_reclaimable",
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_try_migrate",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
--
2.23.0
next prev parent reply other threads:[~2019-11-01 7:58 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-11-01 7:57 [RFC 00/10] autonuma: Optimize memory placement in memory tiering system Huang, Ying
2019-11-01 7:57 ` [RFC 01/10] autonuma: Fix watermark checking in migrate_balanced_pgdat() Huang, Ying
2019-11-01 11:11 ` Mel Gorman
2019-11-01 7:57 ` [RFC 02/10] autonuma: Reduce cache footprint when scanning page tables Huang, Ying
2019-11-01 11:13 ` Mel Gorman
2019-11-01 7:57 ` [RFC 03/10] autonuma: Add NUMA_BALANCING_MEMORY_TIERING mode Huang, Ying
2019-11-01 7:57 ` Huang, Ying [this message]
2019-11-01 7:57 ` [RFC 05/10] autonuma, memory tiering: Use kswapd to demote cold pages to PMEM Huang, Ying
2019-11-01 7:57 ` [RFC 06/10] autonuma, memory tiering: Skip to scan fastest memory Huang, Ying
2019-11-01 7:57 ` [RFC 07/10] autonuma, memory tiering: Only promote page if accessed twice Huang, Ying
2019-11-01 7:57 ` [RFC 08/10] autonuma, memory tiering: Select hotter pages to promote to fast memory node Huang, Ying
2019-11-01 9:24 ` Peter Zijlstra
2019-11-04 2:41 ` Huang, Ying
2019-11-04 8:44 ` Peter Zijlstra
2019-11-04 10:13 ` Huang, Ying
2019-11-01 7:57 ` [RFC 09/10] autonuma, memory tiering: Double hot threshold for write hint page fault Huang, Ying
2019-11-01 7:57 ` [RFC 10/10] autonuma, memory tiering: Adjust hot threshold automatically Huang, Ying
2019-11-01 9:31 ` Peter Zijlstra
2019-11-04 6:11 ` Huang, Ying
2019-11-04 8:49 ` Peter Zijlstra
2019-11-04 10:12 ` Huang, Ying
2019-11-21 8:38 ` Huang, Ying
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20191101075727.26683-5-ying.huang@intel.com \
--to=ying.huang@intel.com \
--cc=akpm@linux-foundation.org \
--cc=dan.j.williams@intel.com \
--cc=dave.hansen@linux.intel.com \
--cc=fengguang.wu@intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mgorman@suse.de \
--cc=mhocko@suse.com \
--cc=mingo@kernel.org \
--cc=peterz@infradead.org \
--cc=riel@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).