diff for duplicates of <20120619180646.GN4633@redhat.com> diff --git a/a/1.txt b/N1/1.txt index 56d0c49..254f472 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -49,3 +49,654 @@ details with Johannes during the MM summit but it's some work to implement it). === +>From 17e1cbc02c1b41037248d9952179ff293a287d58 Mon Sep 17 00:00:00 2001 +From: Andrea Arcangeli <aarcange@redhat.com> +Date: Tue, 19 Jun 2012 18:55:25 +0200 +Subject: [PATCH] autonuma: shrink the per-page page_autonuma struct size + +>From 32 to 12 bytes, so the AutoNUMA memory footprint is reduced to +0.29% of RAM. + +This however will fail to migrate pages above a 16 Terabyte offset +from the start of each node (migration failure isn't fatal, simply +those pages will not follow the CPU, a warning will be printed in the +log just once in that case). + +AutoNUMA will also fail to build if there are more than (2**15)-1 +nodes supported by the MAX_NUMNODES at build time (it would be easy to +relax it to (2**16)-1 nodes without increasing the memory footprint, +but it's not even worth it, so let's keep the negative space reserved +for now). + +This means the max RAM configuration fully supported by AutoNUMA +becomes AUTONUMA_LIST_MAX_PFN_OFFSET multiplied by 32767 nodes +multiplied by the PAGE_SIZE (assume 4096 here, but for some archs it's +bigger). + +4096*32767*(0xffffffff-3)>>(10*5) = 511 PetaBytes. + +Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> +--- + include/linux/autonuma_list.h | 94 ++++++++++++++++++++++ + include/linux/autonuma_types.h | 48 +++++++----- + include/linux/mmzone.h | 3 +- + include/linux/page_autonuma.h | 2 +- + mm/Makefile | 2 +- + mm/autonuma.c | 75 +++++++++++++----- + mm/autonuma_list.c | 167 ++++++++++++++++++++++++++++++++++++++++ + mm/page_autonuma.c | 15 ++-- + 8 files changed, 355 insertions(+), 51 deletions(-) + create mode 100644 include/linux/autonuma_list.h + create mode 100644 mm/autonuma_list.c + +diff --git a/include/linux/autonuma_list.h b/include/linux/autonuma_list.h +new file mode 100644 +index 0000000..0f338e9 +--- /dev/null ++++ b/include/linux/autonuma_list.h +@@ -0,0 +1,94 @@ ++#ifndef __AUTONUMA_LIST_H ++#define __AUTONUMA_LIST_H ++ ++#include <linux/types.h> ++#include <linux/kernel.h> ++ ++typedef uint32_t autonuma_list_entry; ++#define AUTONUMA_LIST_MAX_PFN_OFFSET (AUTONUMA_LIST_HEAD-3) ++#define AUTONUMA_LIST_POISON1 (AUTONUMA_LIST_HEAD-2) ++#define AUTONUMA_LIST_POISON2 (AUTONUMA_LIST_HEAD-1) ++#define AUTONUMA_LIST_HEAD ((uint32_t)UINT_MAX) ++ ++struct autonuma_list_head { ++ autonuma_list_entry anl_next_pfn; ++ autonuma_list_entry anl_prev_pfn; ++}; ++ ++static inline void AUTONUMA_INIT_LIST_HEAD(struct autonuma_list_head *anl) ++{ ++ anl->anl_next_pfn = AUTONUMA_LIST_HEAD; ++ anl->anl_prev_pfn = AUTONUMA_LIST_HEAD; ++} ++ ++/* abstraction conversion methods */ ++extern struct page *autonuma_list_entry_to_page(int nid, ++ autonuma_list_entry pfn_offset); ++extern autonuma_list_entry autonuma_page_to_list_entry(int page_nid, ++ struct page *page); ++extern struct autonuma_list_head *__autonuma_list_head(int page_nid, ++ struct autonuma_list_head *head, ++ autonuma_list_entry pfn_offset); ++ ++extern bool __autonuma_list_add(int page_nid, ++ struct page *page, ++ struct autonuma_list_head *head, ++ autonuma_list_entry prev, ++ autonuma_list_entry next); ++ ++/* ++ * autonuma_list_add - add a new entry ++ * ++ * Insert a new entry after the specified head. ++ */ ++static inline bool autonuma_list_add(int page_nid, ++ struct page *page, ++ autonuma_list_entry entry, ++ struct autonuma_list_head *head) ++{ ++ struct autonuma_list_head *entry_head; ++ entry_head = __autonuma_list_head(page_nid, head, entry); ++ return __autonuma_list_add(page_nid, page, head, ++ entry, entry_head->anl_next_pfn); ++} ++ ++/* ++ * autonuma_list_add_tail - add a new entry ++ * ++ * Insert a new entry before the specified head. ++ * This is useful for implementing queues. ++ */ ++static inline bool autonuma_list_add_tail(int page_nid, ++ struct page *page, ++ autonuma_list_entry entry, ++ struct autonuma_list_head *head) ++{ ++ struct autonuma_list_head *entry_head; ++ entry_head = __autonuma_list_head(page_nid, head, entry); ++ return __autonuma_list_add(page_nid, page, head, ++ entry_head->anl_prev_pfn, entry); ++} ++ ++/* ++ * autonuma_list_del - deletes entry from list. ++ * @entry: the element to delete from the list. ++ */ ++extern void autonuma_list_del(int page_nid, ++ struct autonuma_list_head *entry, ++ struct autonuma_list_head *head); ++ ++extern bool autonuma_list_empty(const struct autonuma_list_head *head); ++ ++#if 0 /* not needed so far */ ++/* ++ * autonuma_list_is_singular - tests whether a list has just one entry. ++ * @head: the list to test. ++ */ ++static inline int autonuma_list_is_singular(const struct autonuma_list_head *head) ++{ ++ return !autonuma_list_empty(head) && ++ (head->anl_next_pfn == head->anl_prev_pfn); ++} ++#endif ++ ++#endif /* __AUTONUMA_LIST_H */ +diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h +index 6662990..1abde9c5 100644 +--- a/include/linux/autonuma_types.h ++++ b/include/linux/autonuma_types.h +@@ -4,6 +4,7 @@ + #ifdef CONFIG_AUTONUMA + + #include <linux/numa.h> ++#include <linux/autonuma_list.h> + + /* + * Per-mm (process) structure dynamically allocated only if autonuma +@@ -45,15 +46,36 @@ struct task_autonuma { + /* + * Per page (or per-pageblock) structure dynamically allocated only if + * autonuma is not impossible. ++ * ++ * This structure takes 12 bytes per page for all architectures. There ++ * are two constraints to make this work: ++ * ++ * 1) the build will abort if * MAX_NUMNODES is too big according to ++ * the #error check below ++ * ++ * 2) AutoNUMA will not succeed to insert into the migration queue any ++ * page whose pfn offset value (offset with respect to the first ++ * pfn of the node) is bigger than AUTONUMA_LIST_MAX_PFN_OFFSET ++ * (NOTE: AUTONUMA_LIST_MAX_PFN_OFFSET is still a valid pfn offset ++ * value). This means with huge node sizes and small PAGE_SIZE, ++ * some pages may not be allowed to be migrated. + */ + struct page_autonuma { + /* + * To modify autonuma_last_nid lockless the architecture, + * needs SMP atomic granularity < sizeof(long), not all archs +- * have that, notably some alpha. Archs without that requires ++ * have that, notably some ancient alpha (but none of those ++ * should run in NUMA systems). Archs without that requires + * autonuma_last_nid to be a long. + */ +-#if BITS_PER_LONG > 32 ++#if MAX_NUMNODES > 32767 ++ /* ++ * Verify at build time that int16_t for autonuma_migrate_nid ++ * and autonuma_last_nid won't risk to overflow, max allowed ++ * nid value is (2**15)-1. ++ */ ++#error "too many nodes" ++#endif + /* + * autonuma_migrate_nid is -1 if the page_autonuma structure + * is not linked into any +@@ -63,7 +85,7 @@ struct page_autonuma { + * page_nid is the nid that the page (referenced by the + * page_autonuma structure) belongs to. + */ +- int autonuma_migrate_nid; ++ int16_t autonuma_migrate_nid; + /* + * autonuma_last_nid records which is the NUMA nid that tried + * to access this page at the last NUMA hinting page fault. +@@ -72,28 +94,14 @@ struct page_autonuma { + * it will make different threads trashing on the same pages, + * converge on the same NUMA node (if possible). + */ +- int autonuma_last_nid; +-#else +-#if MAX_NUMNODES >= 32768 +-#error "too many nodes" +-#endif +- short autonuma_migrate_nid; +- short autonuma_last_nid; +-#endif ++ int16_t autonuma_last_nid; ++ + /* + * This is the list node that links the page (referenced by + * the page_autonuma structure) in the + * &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid] lru. + */ +- struct list_head autonuma_migrate_node; +- +- /* +- * To find the page starting from the autonuma_migrate_node we +- * need a backlink. +- * +- * FIXME: drop it; +- */ +- struct page *page; ++ struct autonuma_list_head autonuma_migrate_node; + }; + + extern int alloc_task_autonuma(struct task_struct *tsk, +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index ed5b0c0..acefdfa 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -17,6 +17,7 @@ + #include <linux/pageblock-flags.h> + #include <generated/bounds.h> + #include <linux/atomic.h> ++#include <linux/autonuma_list.h> + #include <asm/page.h> + + /* Free memory management - zoned buddy allocator. */ +@@ -710,7 +711,7 @@ typedef struct pglist_data { + * <linux/page_autonuma.h> and the below field must remain the + * last one of this structure. + */ +- struct list_head autonuma_migrate_head[0]; ++ struct autonuma_list_head autonuma_migrate_head[0]; + #endif + } pg_data_t; + +diff --git a/include/linux/page_autonuma.h b/include/linux/page_autonuma.h +index bc7a629..e78beda 100644 +--- a/include/linux/page_autonuma.h ++++ b/include/linux/page_autonuma.h +@@ -53,7 +53,7 @@ extern void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma ** + /* inline won't work here */ + #define autonuma_pglist_data_size() (sizeof(struct pglist_data) + \ + (autonuma_impossible() ? 0 : \ +- sizeof(struct list_head) * \ ++ sizeof(struct autonuma_list_head) * \ + num_possible_nodes())) + + #endif /* _LINUX_PAGE_AUTONUMA_H */ +diff --git a/mm/Makefile b/mm/Makefile +index a4d8354..4aa90d4 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -33,7 +33,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o + obj-$(CONFIG_HAS_DMA) += dmapool.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o +-obj-$(CONFIG_AUTONUMA) += autonuma.o page_autonuma.o ++obj-$(CONFIG_AUTONUMA) += autonuma.o page_autonuma.o autonuma_list.o + obj-$(CONFIG_SPARSEMEM) += sparse.o + obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o +diff --git a/mm/autonuma.c b/mm/autonuma.c +index 9834f5d..8aed9af 100644 +--- a/mm/autonuma.c ++++ b/mm/autonuma.c +@@ -89,12 +89,21 @@ void autonuma_migrate_split_huge_page(struct page *page, + VM_BUG_ON(nid < -1); + VM_BUG_ON(page_tail_autonuma->autonuma_migrate_nid != -1); + if (nid >= 0) { +- VM_BUG_ON(page_to_nid(page) != page_to_nid(page_tail)); ++ int page_nid = page_to_nid(page); ++ struct autonuma_list_head *head; ++ autonuma_list_entry entry; ++ entry = autonuma_page_to_list_entry(page_nid, page); ++ head = &NODE_DATA(nid)->autonuma_migrate_head[page_nid]; ++ VM_BUG_ON(page_nid != page_to_nid(page_tail)); ++ VM_BUG_ON(page_nid == nid); + + compound_lock(page_tail); + autonuma_migrate_lock(nid); +- list_add_tail(&page_tail_autonuma->autonuma_migrate_node, +- &page_autonuma->autonuma_migrate_node); ++ if (!autonuma_list_add_tail(page_nid, ++ page_tail, ++ entry, ++ head)) ++ BUG(); + autonuma_migrate_unlock(nid); + + page_tail_autonuma->autonuma_migrate_nid = nid; +@@ -119,8 +128,15 @@ void __autonuma_migrate_page_remove(struct page *page, + VM_BUG_ON(nid < -1); + if (nid >= 0) { + int numpages = hpage_nr_pages(page); ++ int page_nid = page_to_nid(page); ++ struct autonuma_list_head *head; ++ VM_BUG_ON(nid == page_nid); ++ head = &NODE_DATA(nid)->autonuma_migrate_head[page_nid]; ++ + autonuma_migrate_lock(nid); +- list_del(&page_autonuma->autonuma_migrate_node); ++ autonuma_list_del(page_nid, ++ &page_autonuma->autonuma_migrate_node, ++ head); + NODE_DATA(nid)->autonuma_nr_migrate_pages -= numpages; + autonuma_migrate_unlock(nid); + +@@ -139,6 +155,8 @@ static void __autonuma_migrate_page_add(struct page *page, + int numpages; + unsigned long nr_migrate_pages; + wait_queue_head_t *wait_queue; ++ struct autonuma_list_head *head; ++ bool added; + + VM_BUG_ON(dst_nid >= MAX_NUMNODES); + VM_BUG_ON(dst_nid < -1); +@@ -155,25 +173,33 @@ static void __autonuma_migrate_page_add(struct page *page, + VM_BUG_ON(nid >= MAX_NUMNODES); + VM_BUG_ON(nid < -1); + if (nid >= 0) { ++ VM_BUG_ON(nid == page_nid); ++ head = &NODE_DATA(nid)->autonuma_migrate_head[page_nid]; ++ + autonuma_migrate_lock(nid); +- list_del(&page_autonuma->autonuma_migrate_node); ++ autonuma_list_del(page_nid, ++ &page_autonuma->autonuma_migrate_node, ++ head); + NODE_DATA(nid)->autonuma_nr_migrate_pages -= numpages; + autonuma_migrate_unlock(nid); + } + ++ head = &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid]; ++ + autonuma_migrate_lock(dst_nid); +- list_add(&page_autonuma->autonuma_migrate_node, +- &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid]); +- NODE_DATA(dst_nid)->autonuma_nr_migrate_pages += numpages; ++ added = autonuma_list_add(page_nid, page, AUTONUMA_LIST_HEAD, head); ++ if (added) ++ NODE_DATA(dst_nid)->autonuma_nr_migrate_pages += numpages; + nr_migrate_pages = NODE_DATA(dst_nid)->autonuma_nr_migrate_pages; + + autonuma_migrate_unlock(dst_nid); + +- page_autonuma->autonuma_migrate_nid = dst_nid; ++ if (added) ++ page_autonuma->autonuma_migrate_nid = dst_nid; + + compound_unlock_irqrestore(page, flags); + +- if (!autonuma_migrate_defer()) { ++ if (added && !autonuma_migrate_defer()) { + wait_queue = &NODE_DATA(dst_nid)->autonuma_knuma_migrated_wait; + if (nr_migrate_pages >= pages_to_migrate && + nr_migrate_pages - numpages < pages_to_migrate && +@@ -813,7 +839,7 @@ static int isolate_migratepages(struct list_head *migratepages, + struct pglist_data *pgdat) + { + int nr = 0, nid; +- struct list_head *heads = pgdat->autonuma_migrate_head; ++ struct autonuma_list_head *heads = pgdat->autonuma_migrate_head; + + /* FIXME: THP balancing, restart from last nid */ + for_each_online_node(nid) { +@@ -825,10 +851,10 @@ static int isolate_migratepages(struct list_head *migratepages, + cond_resched(); + VM_BUG_ON(numa_node_id() != pgdat->node_id); + if (nid == pgdat->node_id) { +- VM_BUG_ON(!list_empty(&heads[nid])); ++ VM_BUG_ON(!autonuma_list_empty(&heads[nid])); + continue; + } +- if (list_empty(&heads[nid])) ++ if (autonuma_list_empty(&heads[nid])) + continue; + /* some page wants to go to this pgdat */ + /* +@@ -840,22 +866,29 @@ static int isolate_migratepages(struct list_head *migratepages, + * irqs. + */ + autonuma_migrate_lock_irq(pgdat->node_id); +- if (list_empty(&heads[nid])) { ++ if (autonuma_list_empty(&heads[nid])) { + autonuma_migrate_unlock_irq(pgdat->node_id); + continue; + } +- page_autonuma = list_entry(heads[nid].prev, +- struct page_autonuma, +- autonuma_migrate_node); +- page = page_autonuma->page; ++ page = autonuma_list_entry_to_page(nid, ++ heads[nid].anl_prev_pfn); ++ page_autonuma = lookup_page_autonuma(page); + if (unlikely(!get_page_unless_zero(page))) { ++ int page_nid = page_to_nid(page); ++ struct autonuma_list_head *entry_head; ++ VM_BUG_ON(nid == page_nid); ++ + /* + * Is getting freed and will remove self from the + * autonuma list shortly, skip it for now. + */ +- list_del(&page_autonuma->autonuma_migrate_node); +- list_add(&page_autonuma->autonuma_migrate_node, +- &heads[nid]); ++ entry_head = &page_autonuma->autonuma_migrate_node; ++ autonuma_list_del(page_nid, entry_head, ++ &heads[nid]); ++ if (!autonuma_list_add(page_nid, page, ++ AUTONUMA_LIST_HEAD, ++ &heads[nid])) ++ BUG(); + autonuma_migrate_unlock_irq(pgdat->node_id); + autonuma_printk("autonuma migrate page is free\n"); + continue; +diff --git a/mm/autonuma_list.c b/mm/autonuma_list.c +new file mode 100644 +index 0000000..2c840f7 +--- /dev/null ++++ b/mm/autonuma_list.c +@@ -0,0 +1,167 @@ ++/* ++ * Copyright 2006, Red Hat, Inc., Dave Jones ++ * Copyright 2012, Red Hat, Inc. ++ * Released under the General Public License (GPL). ++ * ++ * This file contains the linked list implementations for ++ * autonuma migration lists. ++ */ ++ ++#include <linux/mm.h> ++#include <linux/autonuma.h> ++ ++/* ++ * Insert a new entry between two known consecutive entries. ++ * ++ * This is only for internal list manipulation where we know ++ * the prev/next entries already! ++ * ++ * return true if succeeded, or false if the (page_nid, pfn_offset) ++ * pair couldn't represent the pfn and the list_add didn't succeed. ++ */ ++bool __autonuma_list_add(int page_nid, ++ struct page *page, ++ struct autonuma_list_head *head, ++ autonuma_list_entry prev, ++ autonuma_list_entry next) ++{ ++ autonuma_list_entry new; ++ ++ VM_BUG_ON(page_nid != page_to_nid(page)); ++ new = autonuma_page_to_list_entry(page_nid, page); ++ if (new > AUTONUMA_LIST_MAX_PFN_OFFSET) ++ return false; ++ ++ WARN(new == prev || new == next, ++ "autonuma_list_add double add: new=%u, prev=%u, next=%u.\n", ++ new, prev, next); ++ ++ __autonuma_list_head(page_nid, head, next)->anl_prev_pfn = new; ++ __autonuma_list_head(page_nid, head, new)->anl_next_pfn = next; ++ __autonuma_list_head(page_nid, head, new)->anl_prev_pfn = prev; ++ __autonuma_list_head(page_nid, head, prev)->anl_next_pfn = new; ++ return true; ++} ++ ++static inline void __autonuma_list_del_entry(int page_nid, ++ struct autonuma_list_head *entry, ++ struct autonuma_list_head *head) ++{ ++ autonuma_list_entry prev, next; ++ ++ prev = entry->anl_prev_pfn; ++ next = entry->anl_next_pfn; ++ ++ if (WARN(next == AUTONUMA_LIST_POISON1, ++ "autonuma_list_del corruption, " ++ "%p->anl_next_pfn is AUTONUMA_LIST_POISON1 (%u)\n", ++ entry, AUTONUMA_LIST_POISON1) || ++ WARN(prev == AUTONUMA_LIST_POISON2, ++ "autonuma_list_del corruption, " ++ "%p->anl_prev_pfn is AUTONUMA_LIST_POISON2 (%u)\n", ++ entry, AUTONUMA_LIST_POISON2)) ++ return; ++ ++ __autonuma_list_head(page_nid, head, next)->anl_prev_pfn = prev; ++ __autonuma_list_head(page_nid, head, prev)->anl_next_pfn = next; ++} ++ ++/* ++ * autonuma_list_del - deletes entry from list. ++ * ++ * Note: autonuma_list_empty on entry does not return true after this, ++ * the entry is in an undefined state. ++ */ ++void autonuma_list_del(int page_nid, struct autonuma_list_head *entry, ++ struct autonuma_list_head *head) ++{ ++ __autonuma_list_del_entry(page_nid, entry, head); ++ entry->anl_next_pfn = AUTONUMA_LIST_POISON1; ++ entry->anl_prev_pfn = AUTONUMA_LIST_POISON2; ++} ++ ++/* ++ * autonuma_list_empty - tests whether a list is empty ++ * @head: the list to test. ++ */ ++bool autonuma_list_empty(const struct autonuma_list_head *head) ++{ ++ bool ret = false; ++ if (head->anl_next_pfn == AUTONUMA_LIST_HEAD) { ++ ret = true; ++ BUG_ON(head->anl_prev_pfn != AUTONUMA_LIST_HEAD); ++ } ++ return ret; ++} ++ ++/* abstraction conversion methods */ ++ ++static inline struct page *__autonuma_list_entry_to_page(int page_nid, ++ autonuma_list_entry pfn_offset) ++{ ++ struct pglist_data *pgdat = NODE_DATA(page_nid); ++ unsigned long pfn = pgdat->node_start_pfn + pfn_offset; ++ return pfn_to_page(pfn); ++} ++ ++struct page *autonuma_list_entry_to_page(int page_nid, ++ autonuma_list_entry pfn_offset) ++{ ++ VM_BUG_ON(page_nid < 0); ++ BUG_ON(pfn_offset == AUTONUMA_LIST_POISON1); ++ BUG_ON(pfn_offset == AUTONUMA_LIST_POISON2); ++ BUG_ON(pfn_offset == AUTONUMA_LIST_HEAD); ++ return __autonuma_list_entry_to_page(page_nid, pfn_offset); ++} ++ ++/* ++ * returns a value above AUTONUMA_LIST_MAX_PFN_OFFSET if the pfn is ++ * located a too big offset from the start of the node and cannot be ++ * represented by the (page_nid, pfn_offset) pair. ++ */ ++autonuma_list_entry autonuma_page_to_list_entry(int page_nid, ++ struct page *page) ++{ ++ unsigned long pfn = page_to_pfn(page); ++ struct pglist_data *pgdat = NODE_DATA(page_nid); ++ VM_BUG_ON(page_nid != page_to_nid(page)); ++ BUG_ON(pfn < pgdat->node_start_pfn); ++ pfn -= pgdat->node_start_pfn; ++ if (pfn > AUTONUMA_LIST_MAX_PFN_OFFSET) { ++ WARN_ONCE(1, "autonuma_page_to_list_entry: " ++ "pfn_offset %lu, pgdat %p, " ++ "pgdat->node_start_pfn %lu\n", ++ pfn, pgdat, pgdat->node_start_pfn); ++ /* ++ * Any value bigger than AUTONUMA_LIST_MAX_PFN_OFFSET ++ * will work as an error retval, but better pick one ++ * that will cause noise if computed wrong by the ++ * caller. ++ */ ++ return AUTONUMA_LIST_POISON1; ++ } ++ return pfn; /* convert to uint16_t without losing information */ ++} ++ ++static inline struct autonuma_list_head *____autonuma_list_head(int page_nid, ++ autonuma_list_entry pfn_offset) ++{ ++ struct pglist_data *pgdat = NODE_DATA(page_nid); ++ unsigned long pfn = pgdat->node_start_pfn + pfn_offset; ++ struct page *page = pfn_to_page(pfn); ++ struct page_autonuma *page_autonuma = lookup_page_autonuma(page); ++ return &page_autonuma->autonuma_migrate_node; ++} ++ ++struct autonuma_list_head *__autonuma_list_head(int page_nid, ++ struct autonuma_list_head *head, ++ autonuma_list_entry pfn_offset) ++{ ++ VM_BUG_ON(page_nid < 0); ++ BUG_ON(pfn_offset == AUTONUMA_LIST_POISON1); ++ BUG_ON(pfn_offset == AUTONUMA_LIST_POISON2); ++ if (pfn_offset != AUTONUMA_LIST_HEAD) ++ return ____autonuma_list_head(page_nid, pfn_offset); ++ else ++ return head; ++} +diff --git a/mm/page_autonuma.c b/mm/page_autonuma.c +index f929d81..151f25c 100644 +--- a/mm/page_autonuma.c ++++ b/mm/page_autonuma.c +@@ -12,7 +12,6 @@ void __meminit page_autonuma_map_init(struct page *page, + for (end = page + nr_pages; page < end; page++, page_autonuma++) { + page_autonuma->autonuma_last_nid = -1; + page_autonuma->autonuma_migrate_nid = -1; +- page_autonuma->page = page; + } + } + +@@ -20,12 +19,18 @@ static void __meminit __pgdat_autonuma_init(struct pglist_data *pgdat) + { + int node_iter; + ++ /* verify the per-page page_autonuma 12 byte fixed cost */ ++ BUILD_BUG_ON((unsigned long) &((struct page_autonuma *)0)[1] != 12); ++ + spin_lock_init(&pgdat->autonuma_lock); + init_waitqueue_head(&pgdat->autonuma_knuma_migrated_wait); + pgdat->autonuma_nr_migrate_pages = 0; + if (!autonuma_impossible()) +- for_each_node(node_iter) +- INIT_LIST_HEAD(&pgdat->autonuma_migrate_head[node_iter]); ++ for_each_node(node_iter) { ++ struct autonuma_list_head *head; ++ head = &pgdat->autonuma_migrate_head[node_iter]; ++ AUTONUMA_INIT_LIST_HEAD(head); ++ } + } + + #if !defined(CONFIG_SPARSEMEM) +@@ -112,10 +117,6 @@ struct page_autonuma *lookup_page_autonuma(struct page *page) + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); + +- /* if it's not a power of two we may be wasting memory */ +- BUILD_BUG_ON(SECTION_PAGE_AUTONUMA_SIZE & +- (SECTION_PAGE_AUTONUMA_SIZE-1)); +- + #ifdef CONFIG_DEBUG_VM + /* + * The sanity checks the page allocator does upon freeing a diff --git a/a/content_digest b/N1/content_digest index a2cf1f0..f4efb37 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -84,6 +84,657 @@ "details with Johannes during the MM summit but it's some work to\n" "implement it).\n" "\n" - === + "===\n" + ">From 17e1cbc02c1b41037248d9952179ff293a287d58 Mon Sep 17 00:00:00 2001\n" + "From: Andrea Arcangeli <aarcange@redhat.com>\n" + "Date: Tue, 19 Jun 2012 18:55:25 +0200\n" + "Subject: [PATCH] autonuma: shrink the per-page page_autonuma struct size\n" + "\n" + ">From 32 to 12 bytes, so the AutoNUMA memory footprint is reduced to\n" + "0.29% of RAM.\n" + "\n" + "This however will fail to migrate pages above a 16 Terabyte offset\n" + "from the start of each node (migration failure isn't fatal, simply\n" + "those pages will not follow the CPU, a warning will be printed in the\n" + "log just once in that case).\n" + "\n" + "AutoNUMA will also fail to build if there are more than (2**15)-1\n" + "nodes supported by the MAX_NUMNODES at build time (it would be easy to\n" + "relax it to (2**16)-1 nodes without increasing the memory footprint,\n" + "but it's not even worth it, so let's keep the negative space reserved\n" + "for now).\n" + "\n" + "This means the max RAM configuration fully supported by AutoNUMA\n" + "becomes AUTONUMA_LIST_MAX_PFN_OFFSET multiplied by 32767 nodes\n" + "multiplied by the PAGE_SIZE (assume 4096 here, but for some archs it's\n" + "bigger).\n" + "\n" + "4096*32767*(0xffffffff-3)>>(10*5) = 511 PetaBytes.\n" + "\n" + "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n" + "---\n" + " include/linux/autonuma_list.h | 94 ++++++++++++++++++++++\n" + " include/linux/autonuma_types.h | 48 +++++++-----\n" + " include/linux/mmzone.h | 3 +-\n" + " include/linux/page_autonuma.h | 2 +-\n" + " mm/Makefile | 2 +-\n" + " mm/autonuma.c | 75 +++++++++++++-----\n" + " mm/autonuma_list.c | 167 ++++++++++++++++++++++++++++++++++++++++\n" + " mm/page_autonuma.c | 15 ++--\n" + " 8 files changed, 355 insertions(+), 51 deletions(-)\n" + " create mode 100644 include/linux/autonuma_list.h\n" + " create mode 100644 mm/autonuma_list.c\n" + "\n" + "diff --git a/include/linux/autonuma_list.h b/include/linux/autonuma_list.h\n" + "new file mode 100644\n" + "index 0000000..0f338e9\n" + "--- /dev/null\n" + "+++ b/include/linux/autonuma_list.h\n" + "@@ -0,0 +1,94 @@\n" + "+#ifndef __AUTONUMA_LIST_H\n" + "+#define __AUTONUMA_LIST_H\n" + "+\n" + "+#include <linux/types.h>\n" + "+#include <linux/kernel.h>\n" + "+\n" + "+typedef uint32_t autonuma_list_entry;\n" + "+#define AUTONUMA_LIST_MAX_PFN_OFFSET\t(AUTONUMA_LIST_HEAD-3)\n" + "+#define AUTONUMA_LIST_POISON1\t\t(AUTONUMA_LIST_HEAD-2)\n" + "+#define AUTONUMA_LIST_POISON2\t\t(AUTONUMA_LIST_HEAD-1)\n" + "+#define AUTONUMA_LIST_HEAD\t\t((uint32_t)UINT_MAX)\n" + "+\n" + "+struct autonuma_list_head {\n" + "+\tautonuma_list_entry anl_next_pfn;\n" + "+\tautonuma_list_entry anl_prev_pfn;\n" + "+};\n" + "+\n" + "+static inline void AUTONUMA_INIT_LIST_HEAD(struct autonuma_list_head *anl)\n" + "+{\n" + "+\tanl->anl_next_pfn = AUTONUMA_LIST_HEAD;\n" + "+\tanl->anl_prev_pfn = AUTONUMA_LIST_HEAD;\n" + "+}\n" + "+\n" + "+/* abstraction conversion methods */\n" + "+extern struct page *autonuma_list_entry_to_page(int nid,\n" + "+\t\t\t\t\tautonuma_list_entry pfn_offset);\n" + "+extern autonuma_list_entry autonuma_page_to_list_entry(int page_nid,\n" + "+\t\t\t\t\t\t struct page *page);\n" + "+extern struct autonuma_list_head *__autonuma_list_head(int page_nid,\n" + "+\t\t\t\t\tstruct autonuma_list_head *head,\n" + "+\t\t\t\t\tautonuma_list_entry pfn_offset);\n" + "+\n" + "+extern bool __autonuma_list_add(int page_nid,\n" + "+\t\t\t\tstruct page *page,\n" + "+\t\t\t\tstruct autonuma_list_head *head,\n" + "+\t\t\t\tautonuma_list_entry prev,\n" + "+\t\t\t\tautonuma_list_entry next);\n" + "+\n" + "+/*\n" + "+ * autonuma_list_add - add a new entry\n" + "+ *\n" + "+ * Insert a new entry after the specified head.\n" + "+ */\n" + "+static inline bool autonuma_list_add(int page_nid,\n" + "+\t\t\t\t struct page *page,\n" + "+\t\t\t\t autonuma_list_entry entry,\n" + "+\t\t\t\t struct autonuma_list_head *head)\n" + "+{\n" + "+\tstruct autonuma_list_head *entry_head;\n" + "+\tentry_head = __autonuma_list_head(page_nid, head, entry);\n" + "+\treturn __autonuma_list_add(page_nid, page, head,\n" + "+\t\t\t\t entry, entry_head->anl_next_pfn);\n" + "+}\n" + "+\n" + "+/*\n" + "+ * autonuma_list_add_tail - add a new entry\n" + "+ *\n" + "+ * Insert a new entry before the specified head.\n" + "+ * This is useful for implementing queues.\n" + "+ */\n" + "+static inline bool autonuma_list_add_tail(int page_nid,\n" + "+\t\t\t\t\t struct page *page,\n" + "+\t\t\t\t\t autonuma_list_entry entry,\n" + "+\t\t\t\t\t struct autonuma_list_head *head)\n" + "+{\n" + "+\tstruct autonuma_list_head *entry_head;\n" + "+\tentry_head = __autonuma_list_head(page_nid, head, entry);\n" + "+\treturn __autonuma_list_add(page_nid, page, head,\n" + "+\t\t\t\t entry_head->anl_prev_pfn, entry);\n" + "+}\n" + "+\n" + "+/*\n" + "+ * autonuma_list_del - deletes entry from list.\n" + "+ * @entry: the element to delete from the list.\n" + "+ */\n" + "+extern void autonuma_list_del(int page_nid,\n" + "+\t\t\t struct autonuma_list_head *entry,\n" + "+\t\t\t struct autonuma_list_head *head);\n" + "+\n" + "+extern bool autonuma_list_empty(const struct autonuma_list_head *head);\n" + "+\n" + "+#if 0 /* not needed so far */\n" + "+/*\n" + "+ * autonuma_list_is_singular - tests whether a list has just one entry.\n" + "+ * @head: the list to test.\n" + "+ */\n" + "+static inline int autonuma_list_is_singular(const struct autonuma_list_head *head)\n" + "+{\n" + "+\treturn !autonuma_list_empty(head) &&\n" + "+\t\t(head->anl_next_pfn == head->anl_prev_pfn);\n" + "+}\n" + "+#endif\n" + "+\n" + "+#endif /* __AUTONUMA_LIST_H */\n" + "diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h\n" + "index 6662990..1abde9c5 100644\n" + "--- a/include/linux/autonuma_types.h\n" + "+++ b/include/linux/autonuma_types.h\n" + "@@ -4,6 +4,7 @@\n" + " #ifdef CONFIG_AUTONUMA\n" + " \n" + " #include <linux/numa.h>\n" + "+#include <linux/autonuma_list.h>\n" + " \n" + " /*\n" + " * Per-mm (process) structure dynamically allocated only if autonuma\n" + "@@ -45,15 +46,36 @@ struct task_autonuma {\n" + " /*\n" + " * Per page (or per-pageblock) structure dynamically allocated only if\n" + " * autonuma is not impossible.\n" + "+ *\n" + "+ * This structure takes 12 bytes per page for all architectures. There\n" + "+ * are two constraints to make this work:\n" + "+ *\n" + "+ * 1) the build will abort if * MAX_NUMNODES is too big according to\n" + "+ * the #error check below\n" + "+ *\n" + "+ * 2) AutoNUMA will not succeed to insert into the migration queue any\n" + "+ * page whose pfn offset value (offset with respect to the first\n" + "+ * pfn of the node) is bigger than AUTONUMA_LIST_MAX_PFN_OFFSET\n" + "+ * (NOTE: AUTONUMA_LIST_MAX_PFN_OFFSET is still a valid pfn offset\n" + "+ * value). This means with huge node sizes and small PAGE_SIZE,\n" + "+ * some pages may not be allowed to be migrated.\n" + " */\n" + " struct page_autonuma {\n" + " \t/*\n" + " \t * To modify autonuma_last_nid lockless the architecture,\n" + " \t * needs SMP atomic granularity < sizeof(long), not all archs\n" + "-\t * have that, notably some alpha. Archs without that requires\n" + "+\t * have that, notably some ancient alpha (but none of those\n" + "+\t * should run in NUMA systems). Archs without that requires\n" + " \t * autonuma_last_nid to be a long.\n" + " \t */\n" + "-#if BITS_PER_LONG > 32\n" + "+#if MAX_NUMNODES > 32767\n" + "+\t/*\n" + "+\t * Verify at build time that int16_t for autonuma_migrate_nid\n" + "+\t * and autonuma_last_nid won't risk to overflow, max allowed\n" + "+\t * nid value is (2**15)-1.\n" + "+\t */\n" + "+#error \"too many nodes\"\n" + "+#endif\n" + " \t/*\n" + " \t * autonuma_migrate_nid is -1 if the page_autonuma structure\n" + " \t * is not linked into any\n" + "@@ -63,7 +85,7 @@ struct page_autonuma {\n" + " \t * page_nid is the nid that the page (referenced by the\n" + " \t * page_autonuma structure) belongs to.\n" + " \t */\n" + "-\tint autonuma_migrate_nid;\n" + "+\tint16_t autonuma_migrate_nid;\n" + " \t/*\n" + " \t * autonuma_last_nid records which is the NUMA nid that tried\n" + " \t * to access this page at the last NUMA hinting page fault.\n" + "@@ -72,28 +94,14 @@ struct page_autonuma {\n" + " \t * it will make different threads trashing on the same pages,\n" + " \t * converge on the same NUMA node (if possible).\n" + " \t */\n" + "-\tint autonuma_last_nid;\n" + "-#else\n" + "-#if MAX_NUMNODES >= 32768\n" + "-#error \"too many nodes\"\n" + "-#endif\n" + "-\tshort autonuma_migrate_nid;\n" + "-\tshort autonuma_last_nid;\n" + "-#endif\n" + "+\tint16_t autonuma_last_nid;\n" + "+\n" + " \t/*\n" + " \t * This is the list node that links the page (referenced by\n" + " \t * the page_autonuma structure) in the\n" + " \t * &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid] lru.\n" + " \t */\n" + "-\tstruct list_head autonuma_migrate_node;\n" + "-\n" + "-\t/*\n" + "-\t * To find the page starting from the autonuma_migrate_node we\n" + "-\t * need a backlink.\n" + "-\t *\n" + "-\t * FIXME: drop it;\n" + "-\t */\n" + "-\tstruct page *page;\n" + "+\tstruct autonuma_list_head autonuma_migrate_node;\n" + " };\n" + " \n" + " extern int alloc_task_autonuma(struct task_struct *tsk,\n" + "diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h\n" + "index ed5b0c0..acefdfa 100644\n" + "--- a/include/linux/mmzone.h\n" + "+++ b/include/linux/mmzone.h\n" + "@@ -17,6 +17,7 @@\n" + " #include <linux/pageblock-flags.h>\n" + " #include <generated/bounds.h>\n" + " #include <linux/atomic.h>\n" + "+#include <linux/autonuma_list.h>\n" + " #include <asm/page.h>\n" + " \n" + " /* Free memory management - zoned buddy allocator. */\n" + "@@ -710,7 +711,7 @@ typedef struct pglist_data {\n" + " \t * <linux/page_autonuma.h> and the below field must remain the\n" + " \t * last one of this structure.\n" + " \t */\n" + "-\tstruct list_head autonuma_migrate_head[0];\n" + "+\tstruct autonuma_list_head autonuma_migrate_head[0];\n" + " #endif\n" + " } pg_data_t;\n" + " \n" + "diff --git a/include/linux/page_autonuma.h b/include/linux/page_autonuma.h\n" + "index bc7a629..e78beda 100644\n" + "--- a/include/linux/page_autonuma.h\n" + "+++ b/include/linux/page_autonuma.h\n" + "@@ -53,7 +53,7 @@ extern void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma **\n" + " /* inline won't work here */\n" + " #define autonuma_pglist_data_size() (sizeof(struct pglist_data) +\t\\\n" + " \t\t\t\t (autonuma_impossible() ? 0 :\t\\\n" + "-\t\t\t\t sizeof(struct list_head) * \\\n" + "+\t\t\t\t sizeof(struct autonuma_list_head) * \\\n" + " \t\t\t\t num_possible_nodes()))\n" + " \n" + " #endif /* _LINUX_PAGE_AUTONUMA_H */\n" + "diff --git a/mm/Makefile b/mm/Makefile\n" + "index a4d8354..4aa90d4 100644\n" + "--- a/mm/Makefile\n" + "+++ b/mm/Makefile\n" + "@@ -33,7 +33,7 @@ obj-$(CONFIG_FRONTSWAP)\t+= frontswap.o\n" + " obj-$(CONFIG_HAS_DMA)\t+= dmapool.o\n" + " obj-$(CONFIG_HUGETLBFS)\t+= hugetlb.o\n" + " obj-$(CONFIG_NUMA) \t+= mempolicy.o\n" + "-obj-$(CONFIG_AUTONUMA) \t+= autonuma.o page_autonuma.o\n" + "+obj-$(CONFIG_AUTONUMA) \t+= autonuma.o page_autonuma.o autonuma_list.o\n" + " obj-$(CONFIG_SPARSEMEM)\t+= sparse.o\n" + " obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o\n" + " obj-$(CONFIG_SLOB) += slob.o\n" + "diff --git a/mm/autonuma.c b/mm/autonuma.c\n" + "index 9834f5d..8aed9af 100644\n" + "--- a/mm/autonuma.c\n" + "+++ b/mm/autonuma.c\n" + "@@ -89,12 +89,21 @@ void autonuma_migrate_split_huge_page(struct page *page,\n" + " \tVM_BUG_ON(nid < -1);\n" + " \tVM_BUG_ON(page_tail_autonuma->autonuma_migrate_nid != -1);\n" + " \tif (nid >= 0) {\n" + "-\t\tVM_BUG_ON(page_to_nid(page) != page_to_nid(page_tail));\n" + "+\t\tint page_nid = page_to_nid(page);\n" + "+\t\tstruct autonuma_list_head *head;\n" + "+\t\tautonuma_list_entry entry;\n" + "+\t\tentry = autonuma_page_to_list_entry(page_nid, page);\n" + "+\t\thead = &NODE_DATA(nid)->autonuma_migrate_head[page_nid];\n" + "+\t\tVM_BUG_ON(page_nid != page_to_nid(page_tail));\n" + "+\t\tVM_BUG_ON(page_nid == nid);\n" + " \n" + " \t\tcompound_lock(page_tail);\n" + " \t\tautonuma_migrate_lock(nid);\n" + "-\t\tlist_add_tail(&page_tail_autonuma->autonuma_migrate_node,\n" + "-\t\t\t &page_autonuma->autonuma_migrate_node);\n" + "+\t\tif (!autonuma_list_add_tail(page_nid,\n" + "+\t\t\t\t\t page_tail,\n" + "+\t\t\t\t\t entry,\n" + "+\t\t\t\t\t head))\n" + "+\t\t\tBUG();\n" + " \t\tautonuma_migrate_unlock(nid);\n" + " \n" + " \t\tpage_tail_autonuma->autonuma_migrate_nid = nid;\n" + "@@ -119,8 +128,15 @@ void __autonuma_migrate_page_remove(struct page *page,\n" + " \tVM_BUG_ON(nid < -1);\n" + " \tif (nid >= 0) {\n" + " \t\tint numpages = hpage_nr_pages(page);\n" + "+\t\tint page_nid = page_to_nid(page);\n" + "+\t\tstruct autonuma_list_head *head;\n" + "+\t\tVM_BUG_ON(nid == page_nid);\n" + "+\t\thead = &NODE_DATA(nid)->autonuma_migrate_head[page_nid];\n" + "+\n" + " \t\tautonuma_migrate_lock(nid);\n" + "-\t\tlist_del(&page_autonuma->autonuma_migrate_node);\n" + "+\t\tautonuma_list_del(page_nid,\n" + "+\t\t\t\t &page_autonuma->autonuma_migrate_node,\n" + "+\t\t\t\t head);\n" + " \t\tNODE_DATA(nid)->autonuma_nr_migrate_pages -= numpages;\n" + " \t\tautonuma_migrate_unlock(nid);\n" + " \n" + "@@ -139,6 +155,8 @@ static void __autonuma_migrate_page_add(struct page *page,\n" + " \tint numpages;\n" + " \tunsigned long nr_migrate_pages;\n" + " \twait_queue_head_t *wait_queue;\n" + "+\tstruct autonuma_list_head *head;\n" + "+\tbool added;\n" + " \n" + " \tVM_BUG_ON(dst_nid >= MAX_NUMNODES);\n" + " \tVM_BUG_ON(dst_nid < -1);\n" + "@@ -155,25 +173,33 @@ static void __autonuma_migrate_page_add(struct page *page,\n" + " \tVM_BUG_ON(nid >= MAX_NUMNODES);\n" + " \tVM_BUG_ON(nid < -1);\n" + " \tif (nid >= 0) {\n" + "+\t\tVM_BUG_ON(nid == page_nid);\n" + "+\t\thead = &NODE_DATA(nid)->autonuma_migrate_head[page_nid];\n" + "+\n" + " \t\tautonuma_migrate_lock(nid);\n" + "-\t\tlist_del(&page_autonuma->autonuma_migrate_node);\n" + "+\t\tautonuma_list_del(page_nid,\n" + "+\t\t\t\t &page_autonuma->autonuma_migrate_node,\n" + "+\t\t\t\t head);\n" + " \t\tNODE_DATA(nid)->autonuma_nr_migrate_pages -= numpages;\n" + " \t\tautonuma_migrate_unlock(nid);\n" + " \t}\n" + " \n" + "+\thead = &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid];\n" + "+\n" + " \tautonuma_migrate_lock(dst_nid);\n" + "-\tlist_add(&page_autonuma->autonuma_migrate_node,\n" + "-\t\t &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid]);\n" + "-\tNODE_DATA(dst_nid)->autonuma_nr_migrate_pages += numpages;\n" + "+\tadded = autonuma_list_add(page_nid, page, AUTONUMA_LIST_HEAD, head);\n" + "+\tif (added)\n" + "+\t\tNODE_DATA(dst_nid)->autonuma_nr_migrate_pages += numpages;\n" + " \tnr_migrate_pages = NODE_DATA(dst_nid)->autonuma_nr_migrate_pages;\n" + " \n" + " \tautonuma_migrate_unlock(dst_nid);\n" + " \n" + "-\tpage_autonuma->autonuma_migrate_nid = dst_nid;\n" + "+\tif (added)\n" + "+\t\tpage_autonuma->autonuma_migrate_nid = dst_nid;\n" + " \n" + " \tcompound_unlock_irqrestore(page, flags);\n" + " \n" + "-\tif (!autonuma_migrate_defer()) {\n" + "+\tif (added && !autonuma_migrate_defer()) {\n" + " \t\twait_queue = &NODE_DATA(dst_nid)->autonuma_knuma_migrated_wait;\n" + " \t\tif (nr_migrate_pages >= pages_to_migrate &&\n" + " \t\t nr_migrate_pages - numpages < pages_to_migrate &&\n" + "@@ -813,7 +839,7 @@ static int isolate_migratepages(struct list_head *migratepages,\n" + " \t\t\t\tstruct pglist_data *pgdat)\n" + " {\n" + " \tint nr = 0, nid;\n" + "-\tstruct list_head *heads = pgdat->autonuma_migrate_head;\n" + "+\tstruct autonuma_list_head *heads = pgdat->autonuma_migrate_head;\n" + " \n" + " \t/* FIXME: THP balancing, restart from last nid */\n" + " \tfor_each_online_node(nid) {\n" + "@@ -825,10 +851,10 @@ static int isolate_migratepages(struct list_head *migratepages,\n" + " \t\tcond_resched();\n" + " \t\tVM_BUG_ON(numa_node_id() != pgdat->node_id);\n" + " \t\tif (nid == pgdat->node_id) {\n" + "-\t\t\tVM_BUG_ON(!list_empty(&heads[nid]));\n" + "+\t\t\tVM_BUG_ON(!autonuma_list_empty(&heads[nid]));\n" + " \t\t\tcontinue;\n" + " \t\t}\n" + "-\t\tif (list_empty(&heads[nid]))\n" + "+\t\tif (autonuma_list_empty(&heads[nid]))\n" + " \t\t\tcontinue;\n" + " \t\t/* some page wants to go to this pgdat */\n" + " \t\t/*\n" + "@@ -840,22 +866,29 @@ static int isolate_migratepages(struct list_head *migratepages,\n" + " \t\t * irqs.\n" + " \t\t */\n" + " \t\tautonuma_migrate_lock_irq(pgdat->node_id);\n" + "-\t\tif (list_empty(&heads[nid])) {\n" + "+\t\tif (autonuma_list_empty(&heads[nid])) {\n" + " \t\t\tautonuma_migrate_unlock_irq(pgdat->node_id);\n" + " \t\t\tcontinue;\n" + " \t\t}\n" + "-\t\tpage_autonuma = list_entry(heads[nid].prev,\n" + "-\t\t\t\t\t struct page_autonuma,\n" + "-\t\t\t\t\t autonuma_migrate_node);\n" + "-\t\tpage = page_autonuma->page;\n" + "+\t\tpage = autonuma_list_entry_to_page(nid,\n" + "+\t\t\t\t\t\t heads[nid].anl_prev_pfn);\n" + "+\t\tpage_autonuma = lookup_page_autonuma(page);\n" + " \t\tif (unlikely(!get_page_unless_zero(page))) {\n" + "+\t\t\tint page_nid = page_to_nid(page);\n" + "+\t\t\tstruct autonuma_list_head *entry_head;\n" + "+\t\t\tVM_BUG_ON(nid == page_nid);\n" + "+\n" + " \t\t\t/*\n" + " \t\t\t * Is getting freed and will remove self from the\n" + " \t\t\t * autonuma list shortly, skip it for now.\n" + " \t\t\t */\n" + "-\t\t\tlist_del(&page_autonuma->autonuma_migrate_node);\n" + "-\t\t\tlist_add(&page_autonuma->autonuma_migrate_node,\n" + "-\t\t\t\t &heads[nid]);\n" + "+\t\t\tentry_head = &page_autonuma->autonuma_migrate_node;\n" + "+\t\t\tautonuma_list_del(page_nid, entry_head,\n" + "+\t\t\t\t\t &heads[nid]);\n" + "+\t\t\tif (!autonuma_list_add(page_nid, page,\n" + "+\t\t\t\t\t AUTONUMA_LIST_HEAD,\n" + "+\t\t\t\t\t &heads[nid]))\n" + "+\t\t\t\tBUG();\n" + " \t\t\tautonuma_migrate_unlock_irq(pgdat->node_id);\n" + " \t\t\tautonuma_printk(\"autonuma migrate page is free\\n\");\n" + " \t\t\tcontinue;\n" + "diff --git a/mm/autonuma_list.c b/mm/autonuma_list.c\n" + "new file mode 100644\n" + "index 0000000..2c840f7\n" + "--- /dev/null\n" + "+++ b/mm/autonuma_list.c\n" + "@@ -0,0 +1,167 @@\n" + "+/*\n" + "+ * Copyright 2006, Red Hat, Inc., Dave Jones\n" + "+ * Copyright 2012, Red Hat, Inc.\n" + "+ * Released under the General Public License (GPL).\n" + "+ *\n" + "+ * This file contains the linked list implementations for\n" + "+ * autonuma migration lists.\n" + "+ */\n" + "+\n" + "+#include <linux/mm.h>\n" + "+#include <linux/autonuma.h>\n" + "+\n" + "+/*\n" + "+ * Insert a new entry between two known consecutive entries.\n" + "+ *\n" + "+ * This is only for internal list manipulation where we know\n" + "+ * the prev/next entries already!\n" + "+ *\n" + "+ * return true if succeeded, or false if the (page_nid, pfn_offset)\n" + "+ * pair couldn't represent the pfn and the list_add didn't succeed.\n" + "+ */\n" + "+bool __autonuma_list_add(int page_nid,\n" + "+\t\t\t struct page *page,\n" + "+\t\t\t struct autonuma_list_head *head,\n" + "+\t\t\t autonuma_list_entry prev,\n" + "+\t\t\t autonuma_list_entry next)\n" + "+{\n" + "+\tautonuma_list_entry new;\n" + "+\n" + "+\tVM_BUG_ON(page_nid != page_to_nid(page));\n" + "+\tnew = autonuma_page_to_list_entry(page_nid, page);\n" + "+\tif (new > AUTONUMA_LIST_MAX_PFN_OFFSET)\n" + "+\t\treturn false;\n" + "+\n" + "+\tWARN(new == prev || new == next,\n" + "+\t \"autonuma_list_add double add: new=%u, prev=%u, next=%u.\\n\",\n" + "+\t new, prev, next);\n" + "+\n" + "+\t__autonuma_list_head(page_nid, head, next)->anl_prev_pfn = new;\n" + "+\t__autonuma_list_head(page_nid, head, new)->anl_next_pfn = next;\n" + "+\t__autonuma_list_head(page_nid, head, new)->anl_prev_pfn = prev;\n" + "+\t__autonuma_list_head(page_nid, head, prev)->anl_next_pfn = new;\n" + "+\treturn true;\n" + "+}\n" + "+\n" + "+static inline void __autonuma_list_del_entry(int page_nid,\n" + "+\t\t\t\t\t struct autonuma_list_head *entry,\n" + "+\t\t\t\t\t struct autonuma_list_head *head)\n" + "+{\n" + "+\tautonuma_list_entry prev, next;\n" + "+\n" + "+\tprev = entry->anl_prev_pfn;\n" + "+\tnext = entry->anl_next_pfn;\n" + "+\n" + "+\tif (WARN(next == AUTONUMA_LIST_POISON1,\n" + "+\t\t \"autonuma_list_del corruption, \"\n" + "+\t\t \"%p->anl_next_pfn is AUTONUMA_LIST_POISON1 (%u)\\n\",\n" + "+\t\tentry, AUTONUMA_LIST_POISON1) ||\n" + "+\t WARN(prev == AUTONUMA_LIST_POISON2,\n" + "+\t\t\"autonuma_list_del corruption, \"\n" + "+\t\t \"%p->anl_prev_pfn is AUTONUMA_LIST_POISON2 (%u)\\n\",\n" + "+\t\tentry, AUTONUMA_LIST_POISON2))\n" + "+\t\treturn;\n" + "+\n" + "+\t__autonuma_list_head(page_nid, head, next)->anl_prev_pfn = prev;\n" + "+\t__autonuma_list_head(page_nid, head, prev)->anl_next_pfn = next;\n" + "+}\n" + "+\n" + "+/*\n" + "+ * autonuma_list_del - deletes entry from list.\n" + "+ *\n" + "+ * Note: autonuma_list_empty on entry does not return true after this,\n" + "+ * the entry is in an undefined state.\n" + "+ */\n" + "+void autonuma_list_del(int page_nid, struct autonuma_list_head *entry,\n" + "+\t\t struct autonuma_list_head *head)\n" + "+{\n" + "+\t__autonuma_list_del_entry(page_nid, entry, head);\n" + "+\tentry->anl_next_pfn = AUTONUMA_LIST_POISON1;\n" + "+\tentry->anl_prev_pfn = AUTONUMA_LIST_POISON2;\n" + "+}\n" + "+\n" + "+/*\n" + "+ * autonuma_list_empty - tests whether a list is empty\n" + "+ * @head: the list to test.\n" + "+ */\n" + "+bool autonuma_list_empty(const struct autonuma_list_head *head)\n" + "+{\n" + "+\tbool ret = false;\n" + "+\tif (head->anl_next_pfn == AUTONUMA_LIST_HEAD) {\n" + "+\t\tret = true;\n" + "+\t\tBUG_ON(head->anl_prev_pfn != AUTONUMA_LIST_HEAD);\n" + "+\t}\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + "+/* abstraction conversion methods */\n" + "+\n" + "+static inline struct page *__autonuma_list_entry_to_page(int page_nid,\n" + "+\t\t\t\t\t\t\t autonuma_list_entry pfn_offset)\n" + "+{\n" + "+\tstruct pglist_data *pgdat = NODE_DATA(page_nid);\n" + "+\tunsigned long pfn = pgdat->node_start_pfn + pfn_offset;\n" + "+\treturn pfn_to_page(pfn);\n" + "+}\n" + "+\n" + "+struct page *autonuma_list_entry_to_page(int page_nid,\n" + "+\t\t\t\t\t autonuma_list_entry pfn_offset)\n" + "+{\n" + "+\tVM_BUG_ON(page_nid < 0);\n" + "+\tBUG_ON(pfn_offset == AUTONUMA_LIST_POISON1);\n" + "+\tBUG_ON(pfn_offset == AUTONUMA_LIST_POISON2);\n" + "+\tBUG_ON(pfn_offset == AUTONUMA_LIST_HEAD);\n" + "+\treturn __autonuma_list_entry_to_page(page_nid, pfn_offset);\n" + "+}\n" + "+\n" + "+/*\n" + "+ * returns a value above AUTONUMA_LIST_MAX_PFN_OFFSET if the pfn is\n" + "+ * located a too big offset from the start of the node and cannot be\n" + "+ * represented by the (page_nid, pfn_offset) pair.\n" + "+ */\n" + "+autonuma_list_entry autonuma_page_to_list_entry(int page_nid,\n" + "+\t\t\t\t\t\tstruct page *page)\n" + "+{\n" + "+\tunsigned long pfn = page_to_pfn(page);\n" + "+\tstruct pglist_data *pgdat = NODE_DATA(page_nid);\n" + "+\tVM_BUG_ON(page_nid != page_to_nid(page));\n" + "+\tBUG_ON(pfn < pgdat->node_start_pfn);\n" + "+\tpfn -= pgdat->node_start_pfn;\n" + "+\tif (pfn > AUTONUMA_LIST_MAX_PFN_OFFSET) {\n" + "+\t\tWARN_ONCE(1, \"autonuma_page_to_list_entry: \"\n" + "+\t\t\t \"pfn_offset %lu, pgdat %p, \"\n" + "+\t\t\t \"pgdat->node_start_pfn %lu\\n\",\n" + "+\t\t\t pfn, pgdat, pgdat->node_start_pfn);\n" + "+\t\t/*\n" + "+\t\t * Any value bigger than AUTONUMA_LIST_MAX_PFN_OFFSET\n" + "+\t\t * will work as an error retval, but better pick one\n" + "+\t\t * that will cause noise if computed wrong by the\n" + "+\t\t * caller.\n" + "+\t\t */\n" + "+\t\treturn AUTONUMA_LIST_POISON1;\n" + "+\t}\n" + "+\treturn pfn; /* convert to uint16_t without losing information */\n" + "+}\n" + "+\n" + "+static inline struct autonuma_list_head *____autonuma_list_head(int page_nid,\n" + "+\t\t\t\t\tautonuma_list_entry pfn_offset)\n" + "+{\n" + "+\tstruct pglist_data *pgdat = NODE_DATA(page_nid);\n" + "+\tunsigned long pfn = pgdat->node_start_pfn + pfn_offset;\n" + "+\tstruct page *page = pfn_to_page(pfn);\n" + "+\tstruct page_autonuma *page_autonuma = lookup_page_autonuma(page);\n" + "+\treturn &page_autonuma->autonuma_migrate_node;\n" + "+}\n" + "+\n" + "+struct autonuma_list_head *__autonuma_list_head(int page_nid,\n" + "+\t\t\t\t\tstruct autonuma_list_head *head,\n" + "+\t\t\t\t\tautonuma_list_entry pfn_offset)\n" + "+{\n" + "+\tVM_BUG_ON(page_nid < 0);\n" + "+\tBUG_ON(pfn_offset == AUTONUMA_LIST_POISON1);\n" + "+\tBUG_ON(pfn_offset == AUTONUMA_LIST_POISON2);\n" + "+\tif (pfn_offset != AUTONUMA_LIST_HEAD)\n" + "+\t\treturn ____autonuma_list_head(page_nid, pfn_offset);\n" + "+\telse\n" + "+\t\treturn head;\n" + "+}\n" + "diff --git a/mm/page_autonuma.c b/mm/page_autonuma.c\n" + "index f929d81..151f25c 100644\n" + "--- a/mm/page_autonuma.c\n" + "+++ b/mm/page_autonuma.c\n" + "@@ -12,7 +12,6 @@ void __meminit page_autonuma_map_init(struct page *page,\n" + " \tfor (end = page + nr_pages; page < end; page++, page_autonuma++) {\n" + " \t\tpage_autonuma->autonuma_last_nid = -1;\n" + " \t\tpage_autonuma->autonuma_migrate_nid = -1;\n" + "-\t\tpage_autonuma->page = page;\n" + " \t}\n" + " }\n" + " \n" + "@@ -20,12 +19,18 @@ static void __meminit __pgdat_autonuma_init(struct pglist_data *pgdat)\n" + " {\n" + " \tint node_iter;\n" + " \n" + "+\t/* verify the per-page page_autonuma 12 byte fixed cost */\n" + "+\tBUILD_BUG_ON((unsigned long) &((struct page_autonuma *)0)[1] != 12);\n" + "+\n" + " \tspin_lock_init(&pgdat->autonuma_lock);\n" + " \tinit_waitqueue_head(&pgdat->autonuma_knuma_migrated_wait);\n" + " \tpgdat->autonuma_nr_migrate_pages = 0;\n" + " \tif (!autonuma_impossible())\n" + "-\t\tfor_each_node(node_iter)\n" + "-\t\t\tINIT_LIST_HEAD(&pgdat->autonuma_migrate_head[node_iter]);\n" + "+\t\tfor_each_node(node_iter) {\n" + "+\t\t\tstruct autonuma_list_head *head;\n" + "+\t\t\thead = &pgdat->autonuma_migrate_head[node_iter];\n" + "+\t\t\tAUTONUMA_INIT_LIST_HEAD(head);\n" + "+\t\t}\n" + " }\n" + " \n" + " #if !defined(CONFIG_SPARSEMEM)\n" + "@@ -112,10 +117,6 @@ struct page_autonuma *lookup_page_autonuma(struct page *page)\n" + " \tunsigned long pfn = page_to_pfn(page);\n" + " \tstruct mem_section *section = __pfn_to_section(pfn);\n" + " \n" + "-\t/* if it's not a power of two we may be wasting memory */\n" + "-\tBUILD_BUG_ON(SECTION_PAGE_AUTONUMA_SIZE &\n" + "-\t\t (SECTION_PAGE_AUTONUMA_SIZE-1));\n" + "-\n" + " #ifdef CONFIG_DEBUG_VM\n" + " \t/*\n" + " \t * The sanity checks the page allocator does upon freeing a" -7f220ba1c43a6d37c22388909daf9dfa12d352f7d03ea336f750748b5e4ce48f +c36af17d97bc4e141b8c77c815b3a66b504017e70b78c685b71b38654cdff5b3
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.