From: Cody P Schafer <cody@linux.vnet.ibm.com>
To: Linux MM <linux-mm@kvack.org>
Cc: Cody P Schafer <cody@linux.vnet.ibm.com>,
David Hansen <dave@linux.vnet.ibm.com>
Subject: [PATCH 15/24] mm: add memlayout & dnuma to track pfn->nid & transplant pages between nodes
Date: Thu, 28 Feb 2013 13:26:12 -0800
Message-ID: <1362086781-16725-6-git-send-email-cody@linux.vnet.ibm.com>
In-Reply-To: <1362086781-16725-1-git-send-email-cody@linux.vnet.ibm.com>
In-Reply-To: <1362084272-11282-1-git-send-email-cody@linux.vnet.ibm.com>
On certain systems, the hypervisor can (and will) relocate the physical
addresses seen within a VM between real NUMA nodes. For example, IBM
Power systems running under PHYP (IBM's proprietary hypervisor) do this.

This changeset introduces the infrastructure for tracking and dynamically
changing "memory layouts" (or "memlayouts"): the mapping between page
ranges and the NUMA node that actually backs them.

A memlayout is an rbtree that maps pfns (really, ranges of pfns) to a
node. This mapping, combined with the LookupNode pageflag, is used to
"transplant" pages (move them between nodes) when they are freed back to
the page allocator.

Additionally, when a new memlayout is committed, pages that are currently
free but sit on the wrong zone's freelist are immediately transplanted.
Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
---
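A usage sketch, for illustration only (not part of the patch): a
layout-change driver running in a sleepable context would build a new
memlayout and commit it. The function name, pfn ranges, and node numbers
below are made up.

  #include <linux/memlayout.h>

  /* Hypothetical example: install a new pfn -> nid mapping. */
  static int example_apply_new_layout(void)
  {
          struct memlayout *ml = memlayout_create(ML_DNUMA);

          if (!ml)
                  return -ENOMEM;

          /* pfn_end is inclusive; ranges must not overlap. */
          if (memlayout_new_range(ml, 0x00000, 0x0ffff, 0) ||
              memlayout_new_range(ml, 0x10000, 0x1ffff, 1)) {
                  memlayout_destroy(ml);
                  return -EINVAL;
          }

          /* Takes ownership of ml and sleeps (synchronize_rcu()).
           * Free pages now in the wrong zone are transplanted here;
           * allocated pages are transplanted as they are freed. */
          memlayout_commit(ml);
          return 0;
  }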
include/linux/dnuma.h | 96 +++++++++++++
include/linux/memlayout.h | 110 +++++++++++++++
mm/Kconfig | 19 +++
mm/Makefile | 1 +
mm/dnuma.c | 349 ++++++++++++++++++++++++++++++++++++++++++++++
mm/memlayout.c | 238 +++++++++++++++++++++++++++++++
6 files changed, 813 insertions(+)
create mode 100644 include/linux/dnuma.h
create mode 100644 include/linux/memlayout.h
create mode 100644 mm/dnuma.c
create mode 100644 mm/memlayout.c
diff --git a/include/linux/dnuma.h b/include/linux/dnuma.h
new file mode 100644
index 0000000..8f5cbf9
--- /dev/null
+++ b/include/linux/dnuma.h
@@ -0,0 +1,96 @@
+#ifndef LINUX_DNUMA_H_
+#define LINUX_DNUMA_H_
+
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/memlayout.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+
+#ifdef CONFIG_DYNAMIC_NUMA
+/* Must be called _before_ installing new_ml as the pfn_to_node_map */
+void dnuma_online_required_nodes_and_zones(struct memlayout *new_ml);
+
+/* Must be called _after_ installing new_ml as the pfn_to_node_map */
+void dnuma_move_free_pages(struct memlayout *new_ml);
+void dnuma_mark_page_range(struct memlayout *new_ml);
+
+static inline bool dnuma_is_active(void)
+{
+ struct memlayout *ml;
+ bool ret;
+
+ rcu_read_lock();
+ ml = rcu_dereference(pfn_to_node_map);
+ ret = ml && (ml->type != ML_INITIAL);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static inline bool dnuma_has_memlayout(void)
+{
+ return !!rcu_access_pointer(pfn_to_node_map);
+}
+
+static inline int dnuma_page_needs_move(struct page *page)
+{
+ int new_nid, old_nid;
+
+ if (!TestClearPageLookupNode(page))
+ return NUMA_NO_NODE;
+
+ /* FIXME: this does rcu_lock, deref, unlock */
+ if (WARN_ON(!dnuma_is_active()))
+ return NUMA_NO_NODE;
+
+ /* FIXME: and so does this (rcu lock, deref, and unlock) */
+ new_nid = memlayout_pfn_to_nid(page_to_pfn(page));
+ old_nid = page_to_nid(page);
+
+ if (new_nid == NUMA_NO_NODE) {
+ pr_alert("dnuma: pfn %05lx has moved from node %d to a non-memlayout range.\n",
+ page_to_pfn(page), old_nid);
+ return NUMA_NO_NODE;
+ }
+
+ if (new_nid == old_nid)
+ return NUMA_NO_NODE;
+
+ if (WARN_ON(!zone_is_initialized(nid_zone(new_nid, page_zonenum(page)))))
+ return NUMA_NO_NODE;
+
+ return new_nid;
+}
+
+void dnuma_post_free_to_new_zone(struct page *page, int order);
+void dnuma_prior_free_to_new_zone(struct page *page, int order,
+ struct zone *dest_zone,
+ int dest_nid);
+
+#else /* !defined CONFIG_DYNAMIC_NUMA */
+
+static inline bool dnuma_is_active(void)
+{
+ return false;
+}
+
+static inline void dnuma_prior_free_to_new_zone(struct page *page, int order,
+ struct zone *dest_zone,
+ int dest_nid)
+{
+ BUG();
+}
+
+static inline void dnuma_post_free_to_new_zone(struct page *page, int order)
+{
+ BUG();
+}
+
+static inline int dnuma_page_needs_move(struct page *page)
+{
+ return NUMA_NO_NODE;
+}
+#endif /* !defined CONFIG_DYNAMIC_NUMA */
+
+#endif /* defined LINUX_DNUMA_H_ */
diff --git a/include/linux/memlayout.h b/include/linux/memlayout.h
new file mode 100644
index 0000000..eeb88e0
--- /dev/null
+++ b/include/linux/memlayout.h
@@ -0,0 +1,110 @@
+#ifndef LINUX_MEMLAYOUT_H_
+#define LINUX_MEMLAYOUT_H_
+
+#include <linux/memblock.h> /* __init_memblock */
+#include <linux/mm.h> /* NODE_DATA, page_zonenum */
+#include <linux/mmzone.h> /* pfn_to_nid */
+#include <linux/rbtree.h>
+#include <linux/types.h> /* size_t */
+
+#ifdef CONFIG_DYNAMIC_NUMA
+# ifdef NODE_NOT_IN_PAGE_FLAGS
+# error "CONFIG_DYNAMIC_NUMA requires the NODE is in page flags. Try freeing up some flags by decreasing the maximum number of NUMA nodes, or switch to sparsmem-vmemmap"
+# endif
+
+enum memlayout_type {
+ ML_INITIAL,
+ ML_DNUMA,
+ ML_NUM_TYPES
+};
+
+/*
+ * - rbtree of {node, start, end}.
+ * - assumes no 'ranges' overlap.
+ */
+struct rangemap_entry {
+ struct rb_node node;
+ unsigned long pfn_start;
+ /* @pfn_end: inclusive, not stored as a count to make the lookup
+ * faster
+ */
+ unsigned long pfn_end;
+ int nid;
+};
+
+struct memlayout {
+ struct rb_root root;
+ enum memlayout_type type;
+
+ /*
+ * When a memlayout is committed, 'cache' is accessed (the field is read
+ * from & written to) by multiple tasks without additional locking
+ * (other than the rcu locking for accessing the memlayout).
+ *
+ * Do not assume that it will not change. Use ACCESS_ONCE() to avoid
+ * potential races.
+ */
+ struct rangemap_entry *cache;
+
+#ifdef CONFIG_DNUMA_DEBUGFS
+ unsigned seq;
+ struct dentry *d;
+#endif
+};
+
+extern __rcu struct memlayout *pfn_to_node_map;
+
+/* FIXME: overflow potential in completion check */
+#define ml_for_each_pfn_in_range(rme, pfn) \
+ for (pfn = rme->pfn_start; \
+ pfn <= rme->pfn_end; \
+ pfn++)
+
+#define ml_for_each_range(ml, rme) \
+ for (rme = rb_entry(rb_first(&ml->root), typeof(*rme), node); \
+ &rme->node; \
+ rme = rb_entry(rb_next(&rme->node), typeof(*rme), node))
+
+#define rme_next(rme) rb_entry(rb_next(&rme->node), typeof(*rme), node)
+
+struct memlayout *memlayout_create(enum memlayout_type);
+void memlayout_destroy(struct memlayout *ml);
+
+/* Callers accessing the same memlayout are assumed to be serialized */
+int memlayout_new_range(struct memlayout *ml,
+ unsigned long pfn_start, unsigned long pfn_end, int nid);
+
+/* only queries the memlayout tracking structures. */
+int memlayout_pfn_to_nid(unsigned long pfn);
+
+/* Put ranges added by memlayout_new_range() into use by
+ * memlayout_pfn_to_nid() and retire old ranges.
+ *
+ * No modifications to a memlayout can be made after it is committed.
+ *
+ * Sleeps via synchronize_rcu().
+ *
+ * memlayout takes ownership of ml; no further memlayout_new_range() calls
+ * should be issued on it.
+ */
+void memlayout_commit(struct memlayout *ml);
+
+/* Sets up an initial memlayout in early boot.
+ * A weak default which uses memblock is provided.
+ */
+void memlayout_global_init(void);
+
+#else /* ! defined(CONFIG_DYNAMIC_NUMA) */
+
+/* memlayout_new_range() & memlayout_commit() are purposefully omitted */
+
+static inline void memlayout_global_init(void)
+{}
+
+static inline int memlayout_pfn_to_nid(unsigned long pfn)
+{
+ return NUMA_NO_NODE;
+}
+#endif /* !defined(CONFIG_DYNAMIC_NUMA) */
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 2c7aea7..7209ea5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -169,6 +169,25 @@ config MOVABLE_NODE
config HAVE_BOOTMEM_INFO_NODE
def_bool n
+config DYNAMIC_NUMA
+ bool "Dynamic Numa: Allow NUMA layout to change after boot time"
+ depends on NUMA
+ depends on !DISCONTIGMEM
+ depends on MEMORY_HOTPLUG # locking + mem_online_node().
+ help
+ Dynamic NUMA (DNUMA) allows the movement of pages between NUMA nodes at
+ run time.
+
+ Typically, this is used on systems running under a hypervisor which
+ may move the running VM based on the hypervisor's needs. On such a
+ system, this config option enables Linux to update its knowledge of
+ the memory layout.
+
+ If this feature is enabled but not used, a small amount of overhead
+ (an additional NULL pointer check) is added to every page free.
+
+ Choose Y if you have one of these systems (XXX: which ones?), otherwise choose N.
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 3a46287..82fe7c9b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,3 +58,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_DYNAMIC_NUMA) += dnuma.o memlayout.o
diff --git a/mm/dnuma.c b/mm/dnuma.c
new file mode 100644
index 0000000..8bc81b2
--- /dev/null
+++ b/mm/dnuma.c
@@ -0,0 +1,349 @@
+#define pr_fmt(fmt) "dnuma: " fmt
+
+#include <linux/dnuma.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/memory.h>
+
+#include "internal.h"
+
+/* Issues due to pageflag_blocks attached to zones with Discontig Mem (&
+ * Flatmem??).
+ * - Need atomicity over the combination of committing a new memlayout and
+ * removing the pages from free lists.
+ */
+
+/* XXX: "present pages" is guarded by lock_memory_hotplug(), not the spanlock.
+ * Need to change all users. */
+void adjust_zone_present_pages(struct zone *zone, long delta)
+{
+ unsigned long flags;
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone_span_writelock(zone);
+
+ zone->managed_pages += delta;
+ zone->present_pages += delta;
+ zone->zone_pgdat->node_present_pages += delta;
+
+ zone_span_writeunlock(zone);
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+}
+
+/* - must be called under lock_memory_hotplug() */
+/* TODO: avoid iterating over all PFNs. */
+void dnuma_online_required_nodes_and_zones(struct memlayout *new_ml)
+{
+ struct rangemap_entry *rme;
+ ml_for_each_range(new_ml, rme) {
+ unsigned long pfn;
+ int nid = rme->nid;
+
+ if (!node_online(nid)) {
+ pr_info("onlining node %d [start]\n", nid);
+
+ /* XXX: somewhere in here do a memory online notify: we
+ * aren't really onlining memory, but some code uses
+ * memory online notifications to tell if new nodes
+ * have been created.
+ *
+ * Also note that the notifiers expect to be able to do
+ * allocations, i.e. we must allow for might_sleep() */
+ {
+ int ret;
+
+ /* memory_notify() expects:
+ * - to add pages at the same time
+ * - to add zones at the same time
+ * We can do neither of these things.
+ *
+ * FIXME: Right now we just set the things
+ * needed by the slub handler.
+ */
+ struct memory_notify arg = {
+ .status_change_nid_normal = nid,
+ };
+
+ ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = notifier_to_errno(ret);
+ if (WARN_ON(ret)) {
+ /* XXX: other stuff will bug out if we
+ * keep going, need to actually cancel
+ * memlayout changes
+ */
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ }
+ }
+
+ /* Consult hotadd_new_pgdat() */
+ __mem_online_node(nid);
+ if (!node_online(nid)) {
+ pr_alert("node %d not online after onlining\n", nid);
+ }
+
+ pr_info("onlining node %d [complete]\n", nid);
+ }
+
+ /* Determine the zones required */
+ for (pfn = rme->pfn_start; pfn <= rme->pfn_end; pfn++) {
+ struct zone *zone;
+ if (!pfn_valid(pfn))
+ continue;
+
+ zone = nid_zone(nid, page_zonenum(pfn_to_page(pfn)));
+ /* XXX: we (dnuma paths) can handle this (there will
+ * just be quite a few WARNS in the logs), but if we
+ * are indicating error above, should we bail out here
+ * as well? */
+ WARN_ON(ensure_zone_is_initialized(zone, 0, 0));
+ }
+ }
+}
+
+/*
+ * Cannot be folded into dnuma_move_free_pages() because unmarked pages
+ * could be freed back into the zone while dnuma_move_free_pages() is in
+ * the process of iterating over it.
+ */
+void dnuma_mark_page_range(struct memlayout *new_ml)
+{
+ struct rangemap_entry *rme;
+ ml_for_each_range(new_ml, rme) {
+ unsigned long pfn;
+ for (pfn = rme->pfn_start; pfn <= rme->pfn_end; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+ /* FIXME: should we be skipping compound / buddied
+ * pages? */
+ /* FIXME: if PageReserved(), can we just poke the nid
+ * directly? Should we? */
+ SetPageLookupNode(pfn_to_page(pfn));
+ }
+ }
+}
+
+#if 0
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_set_state(node, N_NORMAL_MEMORY);
+
+ if (arg->status_change_nid_high >= 0)
+ node_set_state(node, N_HIGH_MEMORY);
+
+ node_set_state(node, N_MEMORY);
+}
+#endif
+
+void dnuma_post_free_to_new_zone(struct page *page, int order)
+{
+ adjust_zone_present_pages(page_zone(page), (1 << order));
+}
+
+static void dnuma_prior_return_to_new_zone(struct page *page, int order,
+ struct zone *dest_zone,
+ int dest_nid)
+{
+ int i;
+ unsigned long pfn = page_to_pfn(page);
+
+ grow_pgdat_and_zone(dest_zone, pfn, pfn + (1UL << order));
+
+ for (i = 0; i < 1UL << order; i++)
+ set_page_node(&page[i], dest_nid);
+}
+
+static void clear_lookup_node(struct page *page, int order)
+{
+ int i;
+ for (i = 0; i < 1UL << order; i++)
+ ClearPageLookupNode(&page[i]);
+}
+
+/* Does not assume it is called with any locking (but can be called with zone
+ * locks held, if needed) */
+void dnuma_prior_free_to_new_zone(struct page *page, int order,
+ struct zone *dest_zone,
+ int dest_nid)
+{
+ struct zone *curr_zone = page_zone(page);
+
+ /* XXX: Fiddle with 1st zone's locks */
+ adjust_zone_present_pages(curr_zone, -(1UL << order));
+
+ /* XXX: fiddles with 2nd zone's locks */
+ dnuma_prior_return_to_new_zone(page, order, dest_zone, dest_nid);
+}
+
+/* must be called with zone->lock held and memlayout's update_lock held */
+static void remove_free_pages_from_zone(struct zone *zone, struct page *page, int order)
+{
+ /* zone free stats */
+ zone->free_area[order].nr_free--;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+ adjust_zone_present_pages(zone, -(1UL << order));
+
+ list_del(&page->lru);
+ __ClearPageBuddy(page);
+
+ /* Allowed because we hold the memlayout update_lock. */
+ clear_lookup_node(page, order);
+
+ /* XXX: can we shrink spanned_pages & start_pfn without too much work?
+ * - not crucial because having a
+ * larger-than-necessary span simply means that more
+ * PFNs are iterated over.
+ * - would be nice to be able to do this to cut down
+ * on overhead caused by PFN iterators.
+ */
+}
+
+/*
+ * __ref is to allow (__meminit) zone_pcp_update(), which we will have because
+ * DYNAMIC_NUMA depends on MEMORY_HOTPLUG (and all the MEMORY_HOTPLUG comments
+ * indicate __meminit is allowed when they are enabled).
+ */
+static void __ref add_free_page_to_node(int dest_nid, struct page *page, int order)
+{
+ bool need_zonelists_rebuild = false;
+ struct zone *dest_zone = nid_zone(dest_nid, page_zonenum(page));
+ VM_BUG_ON(!zone_is_initialized(dest_zone));
+
+ if (zone_is_empty(dest_zone))
+ need_zonelists_rebuild = true;
+
+ /* Add page to new zone */
+ dnuma_prior_return_to_new_zone(page, order, dest_zone, dest_nid);
+ return_pages_to_zone(page, order, dest_zone);
+ dnuma_post_free_to_new_zone(page, order);
+
+ /* XXX: fixme, there are other states that need fixing up */
+ if (!node_state(dest_nid, N_MEMORY))
+ node_set_state(dest_nid, N_MEMORY);
+
+ if (need_zonelists_rebuild) {
+ /* XXX: also does stop_machine() */
+ //zone_pcp_reset(zone);
+ /* XXX: why is this locking actually needed? */
+ mutex_lock(&zonelists_mutex);
+ //build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL, dest_zone);
+ mutex_unlock(&zonelists_mutex);
+ } else
+ /* FIXME: does stop_machine() after EVERY SINGLE PAGE */
+ /* XXX: this is probably wrong. What does "update" actually
+ * indicate in zone_pcp terms? */
+ zone_pcp_update(dest_zone);
+}
+
+static struct rangemap_entry *add_split_pages_to_zones(
+ struct rangemap_entry *first_rme,
+ struct page *page, int order)
+{
+ int i;
+ struct rangemap_entry *rme = first_rme;
+ for (i = 0; i < (1 << order); i++) {
+ unsigned long pfn = page_to_pfn(page);
+ while (pfn > rme->pfn_end) {
+ rme = rme_next(rme);
+ }
+
+ add_free_page_to_node(rme->nid, page + i, 0);
+ }
+
+ return rme;
+}
+
+void dnuma_move_free_pages(struct memlayout *new_ml)
+{
+ /* FIXME: how does this removal of pages from a zone interact with
+ * migrate types? ISOLATION? */
+ struct rangemap_entry *rme;
+ ml_for_each_range(new_ml, rme) {
+ unsigned long pfn = rme->pfn_start;
+ int range_nid;
+ struct page *page;
+new_rme:
+ range_nid = rme->nid;
+
+ for (; pfn <= rme->pfn_end; pfn++) {
+ struct zone *zone;
+ int page_nid, order;
+ unsigned long flags, last_pfn, first_pfn;
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+#if 0
+ /* XXX: can we ensure this is safe? Pages marked
+ * reserved could be freed into the page allocator if
+ * they mark memory areas that were allocated via
+ * earlier allocators. */
+ if (PageReserved(page)) {
+ set_page_node(page, range_nid);
+ /* TODO: adjust spanned_pages & present_pages & start_pfn. */
+ }
+#endif
+
+ /* Currently allocated, will be fixed up when freed. */
+ if (!PageBuddy(page))
+ continue;
+
+ page_nid = page_to_nid(page);
+ if (page_nid == range_nid)
+ continue;
+
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Someone allocated it since we last checked. It will
+ * be fixed up when it is freed */
+ if (!PageBuddy(page))
+ goto skip_unlock;
+
+ /* It has already been transplanted "somewhere",
+ * somewhere should be the proper zone. */
+ if (page_zone(page) != zone) {
+ VM_BUG_ON(zone != nid_zone(range_nid, page_zonenum(page)));
+ goto skip_unlock;
+ }
+
+ order = page_order(page);
+ first_pfn = pfn & ~((1 << order) - 1);
+ last_pfn = pfn | ((1 << order) - 1);
+ if (WARN(pfn != first_pfn, "pfn %05lx is not first_pfn %05lx\n",
+ pfn, first_pfn)) {
+ pfn = last_pfn;
+ goto skip_unlock;
+ }
+
+ if (last_pfn > rme->pfn_end) {
+ /* this higher order page doesn't fit into the
+ * current range even though it starts there.
+ */
+ pr_warn("high-order page from pfn %05lx to %05lx extends beyond end of rme {%05lx - %05lx}:%d\n",
+ first_pfn, last_pfn,
+ rme->pfn_start, rme->pfn_end,
+ rme->nid);
+
+ remove_free_pages_from_zone(zone, page, order);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ rme = add_split_pages_to_zones(rme, page, order);
+ pfn = last_pfn + 1;
+ goto new_rme;
+ }
+
+ remove_free_pages_from_zone(zone, page, order);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ add_free_page_to_node(range_nid, page, order);
+ pfn = last_pfn;
+ continue;
+skip_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+ }
+}
diff --git a/mm/memlayout.c b/mm/memlayout.c
new file mode 100644
index 0000000..69222ac
--- /dev/null
+++ b/mm/memlayout.c
@@ -0,0 +1,238 @@
+/*
+ * memlayout - provides a mapping of PFN ranges to nodes with the requirements
+ * that looking up a node from a PFN is fast, and changes to the mapping will
+ * occur relatively infrequently.
+ *
+ */
+#define pr_fmt(fmt) "memlayout: " fmt
+
+#include <linux/dnuma.h>
+#include <linux/export.h>
+#include <linux/memblock.h>
+#include <linux/printk.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+/* protected by memlayout_lock */
+__rcu struct memlayout *pfn_to_node_map;
+DEFINE_MUTEX(memlayout_lock);
+
+static void free_rme_tree(struct rb_root *root)
+{
+ struct rangemap_entry *pos, *n;
+ rbtree_postorder_for_each_entry_safe(pos, n, root, node) {
+ kfree(pos);
+ }
+}
+
+static void ml_destroy_mem(struct memlayout *ml)
+{
+ if (!ml)
+ return;
+ free_rme_tree(&ml->root);
+ kfree(ml);
+}
+
+static int find_insertion_point(struct memlayout *ml, unsigned long pfn_start,
+ unsigned long pfn_end, int nid, struct rb_node ***o_new,
+ struct rb_node **o_parent)
+{
+ struct rb_node **new = &ml->root.rb_node, *parent = NULL;
+ struct rangemap_entry *rme;
+ pr_debug("adding range: {%lX-%lX}:%d\n", pfn_start, pfn_end, nid);
+ while (*new) {
+ rme = rb_entry(*new, typeof(*rme), node);
+
+ parent = *new;
+ if (pfn_end < rme->pfn_start && pfn_start < rme->pfn_end)
+ new = &((*new)->rb_left);
+ else if (pfn_start > rme->pfn_end && pfn_end > rme->pfn_end)
+ new = &((*new)->rb_right);
+ else {
+ /* an embedded region, need to use an interval or
+ * sequence tree. */
+ pr_warn("tried to embed {%lX,%lX}:%d inside {%lX-%lX}:%d\n",
+ pfn_start, pfn_end, nid,
+ rme->pfn_start, rme->pfn_end, rme->nid);
+ return 1;
+ }
+ }
+
+ *o_new = new;
+ *o_parent = parent;
+ return 0;
+}
+
+int memlayout_new_range(struct memlayout *ml, unsigned long pfn_start,
+ unsigned long pfn_end, int nid)
+{
+ struct rb_node **new, *parent;
+ struct rangemap_entry *rme;
+
+ if (WARN_ON(nid < 0))
+ return -EINVAL;
+ if (WARN_ON(nid >= MAX_NUMNODES))
+ return -EINVAL;
+
+ if (find_insertion_point(ml, pfn_start, pfn_end, nid, &new, &parent))
+ return 1;
+
+ rme = kmalloc(sizeof(*rme), GFP_KERNEL);
+ if (!rme)
+ return -ENOMEM;
+
+ rme->pfn_start = pfn_start;
+ rme->pfn_end = pfn_end;
+ rme->nid = nid;
+
+ rb_link_node(&rme->node, parent, new);
+ rb_insert_color(&rme->node, &ml->root);
+ return 0;
+}
+
+static inline bool rme_bounds_pfn(struct rangemap_entry *rme, unsigned long pfn)
+{
+ return rme->pfn_start <= pfn && pfn <= rme->pfn_end;
+}
+
+int memlayout_pfn_to_nid(unsigned long pfn)
+{
+ struct rb_node *node;
+ struct memlayout *ml;
+ struct rangemap_entry *rme;
+ rcu_read_lock();
+ ml = rcu_dereference(pfn_to_node_map);
+ if (!ml || (ml->type == ML_INITIAL))
+ goto out;
+
+ rme = ACCESS_ONCE(ml->cache);
+ if (rme && rme_bounds_pfn(rme, pfn)) {
+ rcu_read_unlock();
+ return rme->nid;
+ }
+
+ node = ml->root.rb_node;
+ while (node) {
+ struct rangemap_entry *rme = rb_entry(node, typeof(*rme), node);
+ bool greater_than_start = rme->pfn_start <= pfn;
+ bool less_than_end = pfn <= rme->pfn_end;
+
+ if (greater_than_start && !less_than_end)
+ node = node->rb_right;
+ else if (less_than_end && !greater_than_start)
+ node = node->rb_left;
+ else {
+ /* greater_than_start && less_than_end.
+ * the case (!greater_than_start && !less_than_end)
+ * is impossible */
+ int nid = rme->nid;
+ ACCESS_ONCE(ml->cache) = rme;
+ rcu_read_unlock();
+ return nid;
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return NUMA_NO_NODE;
+}
+
+void memlayout_destroy(struct memlayout *ml)
+{
+ ml_destroy_mem(ml);
+}
+
+struct memlayout *memlayout_create(enum memlayout_type type)
+{
+ struct memlayout *ml;
+
+ if (WARN_ON(type < 0 || type >= ML_NUM_TYPES))
+ return NULL;
+
+ ml = kmalloc(sizeof(*ml), GFP_KERNEL);
+ if (!ml)
+ return NULL;
+
+ ml->root = RB_ROOT;
+ ml->type = type;
+ ml->cache = NULL;
+
+ return ml;
+}
+
+void memlayout_commit(struct memlayout *ml)
+{
+ struct memlayout *old_ml;
+
+ if (ml->type == ML_INITIAL) {
+ if (WARN(dnuma_has_memlayout(), "memlayout marked first is not first, ignoring.\n")) {
+ memlayout_destroy(ml);
+ return;
+ }
+
+ mutex_lock(&memlayout_lock);
+ rcu_assign_pointer(pfn_to_node_map, ml);
+ mutex_unlock(&memlayout_lock);
+ return;
+ }
+
+ lock_memory_hotplug();
+ dnuma_online_required_nodes_and_zones(ml);
+ unlock_memory_hotplug();
+
+ mutex_lock(&memlayout_lock);
+ old_ml = rcu_dereference_protected(pfn_to_node_map,
+ mutex_is_locked(&memlayout_lock));
+
+ rcu_assign_pointer(pfn_to_node_map, ml);
+
+ synchronize_rcu();
+ memlayout_destroy(old_ml);
+
+ /* Must be called only after the new value for pfn_to_node_map has
+ * propagated to all tasks, otherwise some pages may look up the old
+ * pfn_to_node_map on free & not transplant themselves to their new-new
+ * node. */
+ dnuma_mark_page_range(ml);
+
+ /* Do this after the free path is set up so that pages are freed into
+ * their "new" zones; after this completes, no free pages remain in the
+ * wrong zone. */
+ dnuma_move_free_pages(ml);
+
+ /* All new _non pcp_ page allocations now match the memlayout */
+ drain_all_pages();
+ /* All new page allocations now match the memlayout */
+
+ mutex_unlock(&memlayout_lock);
+}
+
+/*
+ * The default memlayout global initializer; uses memblock to determine affinities.
+ * requires: slab_is_available() && memblock not (yet) freed.
+ * sleeps: definitely: memlayout_commit() -> synchronize_rcu()
+ * potentially: kmalloc()
+ */
+__weak __meminit
+void memlayout_global_init(void)
+{
+ int i, nid, errs = 0;
+ unsigned long start, end;
+ struct memlayout *ml = memlayout_create(ML_INITIAL);
+ if (WARN_ON(!ml))
+ return;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+ int r = memlayout_new_range(ml, start, end - 1, nid);
+ if (r) {
+ pr_err("failed to add range [%05lx, %05lx] in node %d to mapping\n",
+ start, end, nid);
+ errs++;
+ } else
+ pr_devel("added range [%05lx, %05lx] in node %d\n",
+ start, end, nid);
+ }
+
+ memlayout_commit(ml);
+}
--
1.8.1.1