Re: RFC/POC Make Page Tables Relocatable Part 2 Page Table Migration Code

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* Re: RFC/POC Make Page Tables Relocatable Part 2 Page Table Migration Code
@ 2007-11-30 16:36 Ross Biro
  2007-11-30 18:04 ` Dave Hansen
  0 siblings, 1 reply; 3+ messages in thread
From: Ross Biro @ 2007-11-30 16:36 UTC (permalink / raw)
  To: Mel Gorman; +Cc: Dave Hansen, linux-mm, Mel Gorman

[-- Attachment #1: Type: text/plain, Size: 139 bytes --]

Here's the actual page table migration code.  I'm not sure I plugged
it into the correct spot, but it works well enough to test.

    Ross

[-- Attachment #2: relocate.patch --]
[-- Type: application/octet-stream, Size: 24254 bytes --]

diff -urwNbB 2.6.23/arch/i386/mm/hugetlbpage.c 2.6.23a/arch/i386/mm/hugetlbpage.c
--- 2.6.23/arch/i386/mm/hugetlbpage.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/i386/mm/hugetlbpage.c	2007-10-29 09:48:48.000000000 -0700
@@ -87,6 +87,7 @@
 		goto out;
 
 	spin_lock(&mm->page_table_lock);
+	delimbo_pud(&pud, mm, addr);
 	if (pud_none(*pud))
 		pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
 	else
diff -urwNbB 2.6.23/arch/powerpc/mm/fault.c 2.6.23a/arch/powerpc/mm/fault.c
--- 2.6.23/arch/powerpc/mm/fault.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/powerpc/mm/fault.c	2007-10-29 09:38:09.000000000 -0700
@@ -301,6 +301,8 @@
 		if (get_pteptr(mm, address, &ptep, &pmdp)) {
 			spinlock_t *ptl = pte_lockptr(mm, pmdp);
 			spin_lock(ptl);
+			delimbo_pte(&ptep, &ptl, &pmdp, mm, address);
+
 			if (pte_present(*ptep)) {
 				struct page *page = pte_page(*ptep);
 
diff -urwNbB 2.6.23/arch/powerpc/mm/hugetlbpage.c 2.6.23a/arch/powerpc/mm/hugetlbpage.c
--- 2.6.23/arch/powerpc/mm/hugetlbpage.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/powerpc/mm/hugetlbpage.c	2007-10-29 09:53:36.000000000 -0700
@@ -77,6 +77,7 @@
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+	delimbo_hpd(&hpdp, mm, address);
 	if (!hugepd_none(*hpdp))
 		kmem_cache_free(huge_pgtable_cache, new);
 	else
diff -urwNbB 2.6.23/arch/ppc/mm/fault.c 2.6.23a/arch/ppc/mm/fault.c
--- 2.6.23/arch/ppc/mm/fault.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/arch/ppc/mm/fault.c	2007-10-29 09:38:19.000000000 -0700
@@ -219,6 +219,7 @@
 		if (get_pteptr(mm, address, &ptep, &pmdp)) {
 			spinlock_t *ptl = pte_lockptr(mm, pmdp);
 			spin_lock(ptl);
+			delimbo_pte(&ptep, &ptl, &pmdp, mm, address);
 			if (pte_present(*ptep)) {
 				struct page *page = pte_page(*ptep);
 
diff -urwNbB 2.6.23/include/asm-generic/pgtable.h 2.6.23a/include/asm-generic/pgtable.h
--- 2.6.23/include/asm-generic/pgtable.h	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/asm-generic/pgtable.h	2007-10-30 07:28:21.000000000 -0700
@@ -4,6 +4,8 @@
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_MMU
 
+#include <linux/sched.h>
+
 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
 /*
  * Largely same as above, but only sets the access flags (dirty,
@@ -199,6 +201,45 @@
 	}
 	return 0;
 }
+
+
+/* Used to rewalk the page tables if after we grab the appropriate lock,
+   we end up with a page that's just waiting to go away. */
+static inline pgd_t *walk_page_table_pgd(struct mm_struct *mm,
+					  unsigned long addr) {
+	return pgd_offset(mm, addr);
+}
+
+static inline pud_t *walk_page_table_pud(struct mm_struct *mm,
+					 unsigned long addr) {
+	pgd_t *pgd;
+	pgd = walk_page_table_pgd(mm, addr);
+	BUG_ON(!pgd);
+	return pud_offset(pgd, addr);
+}
+
+static inline pmd_t *walk_page_table_pmd(struct mm_struct *mm,
+					 unsigned long addr) {
+	pud_t *pud;
+	pud = walk_page_table_pud(mm, addr);
+	BUG_ON(!pud);
+	return  pmd_offset(pud, addr);
+}
+
+static inline pte_t *walk_page_table_pte(struct mm_struct *mm,
+					 unsigned long addr) {
+	pmd_t *pmd;
+	pmd = walk_page_table_pmd(mm, addr);
+	BUG_ON(!pmd);
+	return pte_offset_map(pmd, addr);
+}
+
+static inline pte_t *walk_page_table_huge_pte(struct mm_struct *mm,
+					      unsigned long addr) {
+	return (pte_t *)walk_page_table_pmd(mm, addr);
+}
+
+
 #endif /* CONFIG_MMU */
 
 /*
diff -urwNbB 2.6.23/include/linux/mm.h 2.6.23a/include/linux/mm.h
--- 2.6.23/include/linux/mm.h	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/linux/mm.h	2007-11-30 08:05:40.000000000 -0800
@@ -14,6 +14,7 @@
 #include <linux/debug_locks.h>
 #include <linux/backing-dev.h>
 #include <linux/mm_types.h>
+#include <asm/pgtable.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -935,6 +936,7 @@
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
+	delimbo_pte(&__pte, ptlp, &pmd, mm, address);	\
 	__pte;						\
 })
 
@@ -959,6 +962,92 @@
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
+
+
+
+static inline void delimbo_pte(pte_t **pte, spinlock_t **ptl,  pmd_t **pmd,
+			  struct mm_struct *mm,
+			  unsigned long addr)
+{
+#ifdef CONFIG_SLOW_CACHE
+	while (unlikely(PageDying(pmd_page(**pmd))))
+#endif
+	{
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spin_unlock(*ptl);
+#endif
+		pte_unmap(*pte);
+		*pmd = walk_page_table_pmd(mm, addr);
+		*pte = pte_offset_map(*pmd, addr);
+		*ptl = pte_lockptr(mm, *pmd);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spin_lock(*ptl);
+#endif
+	}
+}
+
+static inline void delimbo_pte_nested(pte_t **pte, spinlock_t **ptl,
+				pmd_t **pmd,
+				struct mm_struct *mm,
+				unsigned long addr, int subclass)
+{
+#ifdef CONFIG_SLOW_CACHE
+	while (unlikely(PageDying(pmd_page(**pmd))))
+#endif
+	{
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spin_unlock(*ptl);
+#endif
+		*pmd = walk_page_table_pmd(mm, addr);
+		*pte = pte_offset_map(*pmd, addr);
+		*ptl = pte_lockptr(mm, *pmd);
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spin_lock_nested(*ptl, subclass);
+#endif
+	}
+}
+
+static inline void delimbo_pud(pud_t **pud,  struct mm_struct *mm,
+			  unsigned long addr) {
+#ifdef CONFIG_SLOW_CACHE
+	while (unlikely(PageDying(virt_to_page(*pud))))
+#endif
+	{
+		*pud = walk_page_table_pud(mm, addr);
+	}
+}
+
+static inline void delimbo_pmd(pmd_t **pmd,  struct mm_struct *mm,
+			       unsigned long addr) {
+#ifdef CONFIG_SLOW_CACHE
+	while (unlikely(PageDying(virt_to_page(*pmd))))
+#endif
+	{
+		*pmd = walk_page_table_pmd(mm, addr);
+	}
+}
+
+static inline void delimbo_pgd(pgd_t **pgd,  struct mm_struct *mm,
+			       unsigned long addr) {
+#ifdef CONFIG_SLOW_CACHE
+	while (unlikely(PageDying(virt_to_page(*pgd))))
+#endif
+	{
+		*pgd = walk_page_table_pgd(mm, addr);
+	}
+}
+
+static inline void delimbo_huge_pte(pte_t **pte,  struct mm_struct *mm,
+				    unsigned long addr) {
+#ifdef CONFIG_SLOW_CACHE
+	while (unlikely(PageDying(virt_to_page(*pte))))
+#endif
+	{
+		*pte = walk_page_table_huge_pte(mm, addr);
+	}
+}
+
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
 /*
  * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
diff -urwNbB 2.6.23/include/linux/mm_types.h 2.6.23a/include/linux/mm_types.h
--- 2.6.23/include/linux/mm_types.h	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/include/linux/mm_types.h	2007-11-13 07:43:09.000000000 -0800
@@ -5,6 +5,7 @@
 #include <linux/threads.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 struct address_space;
 
@@ -61,9 +62,13 @@
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* SLUB: freelist req. slab lock */
 	};
+
+	union {
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
 					 */
+		struct rcu_head rcu;
+	};
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
diff -urwNbB 2.6.23/mm/hugetlb.c 2.6.23a/mm/hugetlb.c
--- 2.6.23/mm/hugetlb.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/hugetlb.c	2007-10-30 07:32:50.000000000 -0700
@@ -379,6 +379,8 @@
 			goto nomem;
 		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
+		delimbo_huge_pte(&src_pte, src, addr);
+		delimbo_huge_pte(&dst_pte, dst, addr);
 		if (!pte_none(*src_pte)) {
 			if (cow)
 				ptep_set_wrprotect(src, addr, src_pte);
@@ -551,6 +553,7 @@
 	}
 
 	spin_lock(&mm->page_table_lock);
+	delimbo_huge_pte(&ptep, mm, address);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
 		goto backout;
@@ -609,6 +612,7 @@
 	ret = 0;
 
 	spin_lock(&mm->page_table_lock);
+	delimbo_huge_pte(&ptep, mm, address);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, *ptep)))
 		if (write_access && !pte_write(entry))
diff -urwNbB 2.6.23/mm/memory.c 2.6.23a/mm/memory.c
--- 2.6.23/mm/memory.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/memory.c	2007-11-30 08:06:17.000000000 -0800
@@ -306,6 +306,7 @@
 
 	pte_lock_init(new);
 	spin_lock(&mm->page_table_lock);
+	delimbo_pmd(&pmd, mm, address);
 	if (pmd_present(*pmd)) {	/* Another has populated it */
 		pte_lock_deinit(new);
 		pte_free(new);
@@ -325,6 +326,7 @@
 		return -ENOMEM;
 
 	spin_lock(&init_mm.page_table_lock);
+	delimbo_pmd(&pmd, &init_mm, address);
 	if (pmd_present(*pmd))		/* Another has populated it */
 		pte_free_kernel(new);
 	else
@@ -504,6 +506,8 @@
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+	delimbo_pte(&src_pte, &src_ptl, &src_pmd, src_mm, addr);
 	arch_enter_lazy_mmu_mode();
 
 	do {
@@ -1558,13 +1562,15 @@
  * and do_anonymous_page and do_no_page can safely check later on).
  */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
+				pte_t *page_table, pte_t orig_pte,
+				unsigned long address)
 {
 	int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 	if (sizeof(pte_t) > sizeof(unsigned long)) {
 		spinlock_t *ptl = pte_lockptr(mm, pmd);
 		spin_lock(ptl);
+		delimbo_pte(&page_table, &ptl, &pmd, mm, address);
 		same = pte_same(*page_table, orig_pte);
 		spin_unlock(ptl);
 	}
@@ -2153,7 +2159,7 @@
 	pte_t pte;
 	int ret = 0;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte, address))
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
@@ -2227,6 +2233,10 @@
 	}
 
 	/* No need to invalidate - it was non-present before */
+	/* Unless of course the cpu might be looking at an old
+	   copy of the pte. */
+	maybe_flush_tlb_mm(mm);
+
 	update_mmu_cache(vma, address, pte);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
@@ -2279,6 +2289,7 @@
 
 		ptl = pte_lockptr(mm, pmd);
 		spin_lock(ptl);
+		delimbo_pte(&page_table, &ptl, &pmd, mm, address);
 		if (!pte_none(*page_table))
 			goto release;
 		inc_mm_counter(mm, file_rss);
@@ -2288,6 +2299,10 @@
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
+	/* Unless of course the cpu might be looking at an old
+	   copy of the pte. */
+	maybe_flush_tlb_mm(mm);
+
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
 unlock:
@@ -2441,6 +2456,10 @@
 		}
 
 		/* no need to invalidate: a not-present page won't be cached */
+		/* Unless of course the cpu could be looking at an old page
+		   table entry. */
+		maybe_flush_tlb_mm(mm);
+
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 	} else {
@@ -2544,7 +2563,7 @@
 				(write_access ? FAULT_FLAG_WRITE : 0);
 	pgoff_t pgoff;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte, address))
 		return 0;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
@@ -2603,6 +2622,7 @@
 
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
+	delimbo_pte(&pte, &ptl, &pmd, mm, address);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
@@ -2625,6 +2645,12 @@
 		if (write_access)
 			flush_tlb_page(vma, address);
 	}
+
+	/* if the cpu could be looking at an old page table, we need to
+	   flush out everything. */
+	maybe_flush_tlb_mm(mm);
+
+
 unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 0;
@@ -2674,6 +2700,7 @@
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+	delimbo_pgd(&pgd, mm, address);
 	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(new);
 	else
@@ -2695,6 +2722,7 @@
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+	delimbo_pud(&pud, mm, address);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */
 		pmd_free(new);
diff -urwNbB 2.6.23/mm/mempolicy.c 2.6.23a/mm/mempolicy.c
--- 2.6.23/mm/mempolicy.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/mempolicy.c	2007-11-30 08:07:58.000000000 -0800
@@ -101,6 +101,11 @@
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
+
+int migrate_page_tables_mm(struct mm_struct *mm,  int source,
+			   new_page_t get_new_page, unsigned long private);
+
+
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
 enum zone_type policy_zone = 0;
@@ -597,6 +602,11 @@
 	return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 }
 
+static struct page *new_node_page_page_tables(struct page *page,
+					      unsigned long node, int **x) {
+	return alloc_pages_node(node, GFP_USER, 0);
+}
+
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -616,6 +626,10 @@
 	if (!list_empty(&pagelist))
 		err = migrate_pages(&pagelist, new_node_page, dest);
 
+	if (!err)
+		err = migrate_page_tables_mm(mm, source,
+					     new_node_page_page_tables, dest);
+
 	return err;
 }
 
@@ -671,6 +685,9 @@
  */
 
 	tmp = *from_nodes;
+	printk(KERN_INFO "from_nodes = %lX to_nodes = %lX\n",
+		*(nodes_addr(tmp)), *(nodes_addr(*to_nodes)));
+
 	while (!nodes_empty(tmp)) {
 		int s,d;
 		int source = -1;
@@ -678,6 +695,8 @@
 
 		for_each_node_mask(s, tmp) {
 			d = node_remap(s, *from_nodes, *to_nodes);
+			printk(KERN_INFO "do_migrate_pages s=%d d=%d\n",
+				s, d);
 			if (s == d)
 				continue;
 
@@ -815,6 +834,8 @@
 	unsigned long nlongs;
 	unsigned long endmask;
 
+	printk(KERN_INFO "get_nodes(%p, %p, %ld)\n", nodes, nmask, maxnode);
+
 	--maxnode;
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
@@ -850,6 +871,8 @@
 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 		return -EFAULT;
 	nodes_addr(*nodes)[nlongs-1] &= endmask;
+	printk (KERN_INFO "endmask=%lX nodes=%lX\n",
+		endmask, nodes_addr(*nodes)[nlongs-1]);
 	return 0;
 }
 
diff -urwNbB 2.6.23/mm/migrate.c 2.6.23a/mm/migrate.c
--- 2.6.23/mm/migrate.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/migrate.c	2007-11-30 08:21:25.000000000 -0800
@@ -28,9 +28,14 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
-
+#include <linux/mm.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
 #include "internal.h"
 
+int migrate_page_tables_mm(struct mm_struct *mm, int source,
+			   new_page_t get_new_page, unsigned long private);
+
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 /*
@@ -158,6 +163,7 @@
 
  	ptl = pte_lockptr(mm, pmd);
  	spin_lock(ptl);
+	delimbo_pte(&ptep, &ptl, &pmd, mm, addr);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
 		goto out;
@@ -800,6 +806,8 @@
 	struct page_to_node *pp;
 	LIST_HEAD(pagelist);
 
+	printk("do_move_pages(%p, %p, %d)\n", mm, pm, migrate_all);
+
 	down_read(&mm->mmap_sem);
 
 	/*
@@ -859,9 +867,10 @@
 		err = migrate_pages(&pagelist, new_page_node,
 				(unsigned long)pm);
 	else
-		err = -ENOENT;
+		err = 0;
 
 	up_read(&mm->mmap_sem);
+
 	return err;
 }
 
@@ -915,9 +924,16 @@
 	struct mm_struct *mm;
 	struct page_to_node *pm = NULL;
 
+	printk("sys_move_pages(%d, %ld, %p, %p, %p, %d)\n",
+		pid, nr_pages, pages, nodes, status, flags);
+
 	/* Check flags */
-	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
-		return -EINVAL;
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) {
+		printk(KERN_INFO "sys_move_pages: bad flags: %d\n",
+			flags);
+		flags = 0;
+		/*XXXXX Fix this before submit return -EINVAL; */
+	}
 
 	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -932,8 +948,10 @@
 	mm = get_task_mm(task);
 	read_unlock(&tasklist_lock);
 
-	if (!mm)
+	if (!mm) {
+		printk(KERN_INFO "No Such Task\n");
 		return -EINVAL;
+	}
 
 	/*
 	 * Check if this process has the right to modify the specified
@@ -1039,3 +1057,301 @@
  	}
  	return err;
 }
+
+static void rcu_free_pt(struct rcu_head *head)
+{
+	struct page *page = container_of(head, struct page, rcu);
+	printk(KERN_INFO "rcu_free_pt freeing %p flags=%lX mapping=%p\n",
+		page, page->flags, page->mapping);
+	__free_page(page);
+}
+
+int migrate_top_level_page_table(struct mm_struct *mm, struct page *dest)
+{
+	return 1;
+#if 0
+	unsigned long flags;
+	void *dest_ptr;
+
+	/* We can't do this until we get a heavy duty tlb flush, or
+	   we can force this mm to be switched on all cpus. */
+	spin_lock_irqsave(&mm->page_table_lock, flags);
+	dest_ptr = page_address(dest);
+	memcpy(dest_ptr, mm->pgd, PAGE_SIZE);
+	INIT_RCU_HEAD(&(virt_to_page(mm->pgd)->rcu));
+	call_rcu(&(virt_to_page(mm->pgd)->rcu), rcu_free_pt);
+	mm->pgd = (pgd_t *)dest_ptr;
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
+	return 0;
+#endif
+}
+
+int migrate_pgd(pgd_t *pgd, struct mm_struct *mm,
+		unsigned long addr, struct page *dest)
+{
+	unsigned long flags;
+	void *dest_ptr;
+	pud_t *pud;
+
+	spin_lock_irqsave(&mm->page_table_lock, flags);
+
+	delimbo_pgd(&pgd, mm, addr);
+
+	printk("migrate_pgd *pgd=%lX\n", *(unsigned long *)pgd);
+
+	pud = pud_offset(pgd, addr);
+	dest_ptr = page_address(dest);
+	memcpy(dest_ptr, pud, PAGE_SIZE);
+	printk(KERN_INFO "migrate_pgd (%p)->flags=%lX\n",
+		pgd_page(*pgd),
+		pgd_page(*pgd)->flags);
+	INIT_RCU_HEAD(&pgd_page(*pgd)->rcu);
+	call_rcu(&(pgd_page(*pgd)->rcu), rcu_free_pt);
+	pgd_populate(mm, pgd, dest_ptr);
+
+	flush_tlb_pgtables(mm, addr,
+			   addr + (1 << PMD_SHIFT)
+			   - 1);
+
+	maybe_need_flush_mm(mm);
+
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
+	return 0;
+
+}
+
+int migrate_pud(pud_t *pud, struct mm_struct *mm, unsigned long addr,
+		struct page *dest)
+{
+	unsigned long flags;
+	void *dest_ptr;
+	pmd_t *pmd;
+
+	spin_lock_irqsave(&mm->page_table_lock, flags);
+
+	delimbo_pud(&pud, mm, addr);
+	printk("migrate_pud *pud=%lX\n", *(unsigned long *)pud);
+
+	pmd = pmd_offset(pud, addr);
+
+	dest_ptr = page_address(dest);
+	memcpy(dest_ptr, pmd, PAGE_SIZE);
+
+	printk(KERN_INFO "migrate_pud (%p)->flags=%lX\n",
+		pud_page(*pud),
+		pud_page(*pud)->flags);
+	INIT_RCU_HEAD(&pud_page(*pud)->rcu);
+	call_rcu(&(pud_page(*pud)->rcu), rcu_free_pt);
+	pud_populate(mm, pud, dest_ptr);
+	flush_tlb_pgtables(mm, addr,
+			   addr + (1 << PMD_SHIFT)
+			   - 1);
+	maybe_need_flush_mm(mm);
+
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
+	return 0;
+}
+
+
+int migrate_pmd(pmd_t *pmd, struct mm_struct *mm, unsigned long addr,
+		struct page *dest)
+{
+	unsigned long flags;
+	void *dest_ptr;
+	spinlock_t *ptl;
+	pte_t *pte;
+
+	spin_lock_irqsave(&mm->page_table_lock, flags);
+
+	delimbo_pmd(&pmd, mm, addr);
+
+	/* this could happen if the page table has been swapped out and we
+	   were looking at the old one. */
+	if (unlikely(!pmd_present(*pmd))) {
+		spin_unlock_irqrestore(&mm->page_table_lock, flags);
+		return 1;
+	}
+
+	printk("migrate_pmd *pmd = %lX\n", *(unsigned long *)pmd);
+	ptl = pte_lockptr(mm, pmd);
+
+	/* We need the page lock as well. */
+	if (ptl != &mm->page_table_lock)
+		spin_lock(ptl);
+
+	pte = pte_offset_map(pmd, addr);
+
+	dest_ptr = kmap_atomic(dest, KM_IRQ0);
+	memcpy(dest_ptr, pte, PAGE_SIZE);
+	printk(KERN_INFO "migrate_pmd (%p)->flags=%lX\n",
+		pmd_page(*pmd),
+		pmd_page(*pmd)->flags);
+	INIT_RCU_HEAD(&pmd_page(*pmd)->rcu);
+	call_rcu(&(pmd_page(*pmd)->rcu), rcu_free_pt);
+	kunmap_atomic(dest, KM_IRQ0);
+	pte_unmap(pte);
+	pte_lock_init(dest);
+	pmd_populate(NULL, pmd, dest);
+	flush_tlb_pgtables(mm, addr,
+			   addr + (1 << PMD_SHIFT)
+			   - 1);
+	maybe_need_flush_mm(mm);
+
+	if (ptl != &mm->page_table_lock)
+		spin_unlock(ptl);
+
+	spin_unlock_irqrestore(&mm->page_table_lock, flags);
+
+	return 0;
+}
+
+static int migrate_page_tables_pmd(pmd_t *pmd, struct mm_struct *mm,
+				   unsigned long *address, int source,
+				   new_page_t get_new_page,
+				   unsigned long private)
+{
+	int pages_not_migrated = 0;
+	int *result = NULL;
+	struct page *old_page = virt_to_page(pmd);
+	struct page *new_page;
+
+	if (!pmd_present(*pmd)) {
+		*address +=  (unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+		return 0;
+	}
+
+	if (page_to_nid(old_page) == source) {
+		new_page = get_new_page(old_page, private, &result);
+		if (!new_page)
+			return -ENOMEM;
+		pages_not_migrated += migrate_pmd(pmd, mm, *address, new_page);
+	}
+	*address +=  (unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+
+	return pages_not_migrated;
+}
+
+static int migrate_page_tables_pud(pud_t *pud, struct mm_struct *mm,
+				   unsigned long *address, int source,
+				   new_page_t get_new_page,
+				   unsigned long private)
+{
+	int pages_not_migrated = 0;
+	int i;
+	int *result = NULL;
+	struct page *old_page = virt_to_page(pud);
+	struct page *new_page;
+
+	if (!pud_present(*pud)) {
+		*address += (unsigned long)PTRS_PER_PMD *
+				(unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+		return 0;
+	}
+
+	if (page_to_nid(old_page) == source) {
+		new_page = get_new_page(old_page, private, &result);
+		if (!new_page)
+			return -ENOMEM;
+
+		pages_not_migrated += migrate_pud(pud, mm, *address, new_page);
+	}
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		int ret;
+		ret = migrate_page_tables_pmd(pmd_offset(pud, *address), mm,
+					      address, source,
+					      get_new_page, private);
+		if (ret < 0)
+			return ret;
+		pages_not_migrated += ret;
+	}
+
+	return pages_not_migrated;
+}
+
+static int migrate_page_tables_pgd(pgd_t *pgd, struct mm_struct *mm,
+				   unsigned long *address, int source,
+				   new_page_t get_new_page,
+				   unsigned long private)
+{
+	int pages_not_migrated = 0;
+	int i;
+	int *result = NULL;
+	struct page *old_page = virt_to_page(pgd);
+	struct page *new_page;
+
+	if (!pgd_present(*pgd)) {
+		*address +=  (unsigned long)PTRS_PER_PUD *
+				(unsigned long)PTRS_PER_PMD *
+				(unsigned long)PTRS_PER_PTE * PAGE_SIZE;
+		return 0;
+	}
+
+	if (page_to_nid(old_page) == source) {
+		new_page = get_new_page(old_page, private, &result);
+		if (!new_page)
+			return -ENOMEM;
+
+		pages_not_migrated += migrate_pgd(pgd, mm,  *address, new_page);
+	}
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		int ret;
+		ret = migrate_page_tables_pud(pud_offset(pgd, *address), mm,
+					      address, source,
+					      get_new_page, private);
+		if (ret < 0)
+			return ret;
+		pages_not_migrated += ret;
+	}
+	return pages_not_migrated;
+}
+
+/* similiar to migrate pages, but migrages the page tables. */
+int migrate_page_tables_mm(struct mm_struct *mm, int source,
+			   new_page_t get_new_page, unsigned long private)
+{
+	int pages_not_migrated = 0;
+	int i;
+	int *result = NULL;
+	struct page *old_page = virt_to_page(mm->pgd);
+	struct page *new_page;
+	unsigned long address = 0UL;
+
+	if (mm->pgd == NULL)
+		return 0;
+
+	if (page_to_nid(old_page) == source) {
+		new_page = get_new_page(old_page, private, &result);
+		if (!new_page) {
+			printk(KERN_INFO "get_new_page failed at top level\n");
+			return -ENOMEM;
+		}
+		pages_not_migrated += migrate_top_level_page_table(mm,
+								new_page);
+	}
+
+	for (i = 0; i < PTRS_PER_PGD && address < mm->task_size; i++) {
+		int ret;
+		printk("migrate_page_tables_mm adress=%lX *pgd = %lX\n",
+			address, *(unsigned long *)pgd_offset(mm, address));
+
+		ret = migrate_page_tables_pgd(pgd_offset(mm, address), mm,
+					      &address, source,
+					      get_new_page, private);
+		if (ret < 0) {
+			printk(KERN_INFO "migrate_page_tables_mm returning early %d\n", ret);
+			return ret;
+		}
+		pages_not_migrated += ret;
+	}
+
+	/* flush the tlbs if necessary. */
+	maybe_flush_tlb_mm(mm);
+
+	printk(KERN_INFO "migrate_page_tables_mm returning %d\n",
+		pages_not_migrated);
+
+	return pages_not_migrated;
+}
diff -urwNbB 2.6.23/mm/mremap.c 2.6.23a/mm/mremap.c
--- 2.6.23/mm/mremap.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/mremap.c	2007-10-30 06:57:49.000000000 -0700
@@ -98,6 +98,7 @@
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	delimbo_pte(&new_pte, &new_ptl, &new_pmd, mm, new_addr);
 	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
diff -urwNbB 2.6.23/mm/rmap.c 2.6.23a/mm/rmap.c
--- 2.6.23/mm/rmap.c	2007-10-09 13:31:38.000000000 -0700
+++ 2.6.23a/mm/rmap.c	2007-10-29 09:46:25.000000000 -0700
@@ -254,6 +254,7 @@
 
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
+	delimbo_pte(&pte, &ptl, &pmd, mm, address);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
 		return pte;

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: RFC/POC Make Page Tables Relocatable Part 2 Page Table Migration Code
  2007-11-30 16:36 RFC/POC Make Page Tables Relocatable Part 2 Page Table Migration Code Ross Biro
@ 2007-11-30 18:04 ` Dave Hansen
  2007-11-30 18:32   ` Ross Biro
  0 siblings, 1 reply; 3+ messages in thread
From: Dave Hansen @ 2007-11-30 18:04 UTC (permalink / raw)
  To: Ross Biro; +Cc: Mel Gorman, linux-mm, Mel Gorman

On Fri, 2007-11-30 at 11:36 -0500, Ross Biro wrote:
> lmbench shows the overhead of rewalking the page tables is less than
> that of spinlock debugging. 

Spinlock debugging can be pretty heavy, so I wouldn't use it as a
benchmark.  Thanks for posting them early, though.

> Here's the actual page table migration code.  I'm not sure I plugged
> it into the correct spot, but it works well enough to test.

Could you remind us exactly what you're trying to do here?  A bit of the
theory of what you're trying would be good.  Also, this is a wee bit
hard to review because it's a bit messy, still has lots of debugging
printks, and needs some CodingStyle love.  Don't forget to add -p do
your diffs while you're at it.

Where did PageDying() come from?  Where ever it came from, please wrap
it up in its header in a nice #ifdef so you don't have to do this a
number of times:

...
> +{
> +#ifdef CONFIG_SLOW_CACHE
> +       while (unlikely(PageDying(pmd_page(**pmd))))
> +#endif
> +       {
...


> +       union {
>         struct list_head lru;           /* Pageout list, eg. active_list
>                                          * protected by zone->lru_lock !
>                                          */
> +               struct rcu_head rcu;
> +       };

There's a nice shiny comment next to 'lru'.  Hint, hint. ;)

> +int migrate_top_level_page_table(struct mm_struct *mm, struct page *dest)
> +{
> +       return 1;
> +#if 0
> +       unsigned long flags;
> +       void *dest_ptr;
> +
> +       /* We can't do this until we get a heavy duty tlb flush, or
> +          we can force this mm to be switched on all cpus. */

Can you elaborate on this?  You need each cpu to do a task switch _away_
from this mm?

> +int migrate_pmd(pmd_t *pmd, struct mm_struct *mm, unsigned long addr,
> +               struct page *dest)
> +{
...
> +       pte = pte_offset_map(pmd, addr);
> +
> +       dest_ptr = kmap_atomic(dest, KM_IRQ0);

Why KM_IRQ0 here?  

-- Dave

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: RFC/POC Make Page Tables Relocatable Part 2 Page Table Migration Code
  2007-11-30 18:04 ` Dave Hansen
@ 2007-11-30 18:32   ` Ross Biro
  0 siblings, 0 replies; 3+ messages in thread
From: Ross Biro @ 2007-11-30 18:32 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Mel Gorman, linux-mm, Mel Gorman

On Nov 30, 2007 1:04 PM, Dave Hansen <haveblue@us.ibm.com> wrote:
> On Fri, 2007-11-30 at 11:36 -0500, Ross Biro wrote:
> > lmbench shows the overhead of rewalking the page tables is less than
> > that of spinlock debugging.
>
> Spinlock debugging can be pretty heavy, so I wouldn't use it as a
> benchmark.  Thanks for posting them early, though.

It was unintentional.  I was really excited because I saw no
performance hit from my changes.  It wasn't until the next time I hit
an uninitialized spinlock that I realized my benchmark was all but
useless.

>
> > Here's the actual page table migration code.  I'm not sure I plugged
> > it into the correct spot, but it works well enough to test.
>
> Could you remind us exactly what you're trying to do here?  A bit of the
> theory of what you're trying would be good.  Also, this is a wee bit
> hard to review because it's a bit messy, still has lots of debugging
> printks, and needs some CodingStyle love.  Don't forget to add -p do
> your diffs while you're at it.

Sorry about that.  I rushed these out so they wouldn't sit around for
a couple of weeks before I could get them out.

The goal is to make page tables relocatable.  Right now, I'm only
trying to relocate the page tables when moving a process from one node
to another in a numa system. However, the same code should work just
as well to move page tables around in a node to free up larger blocks
of memory.

> Where did PageDying() come from?  Where ever it came from, please wrap
> it up in its header in a nice #ifdef so you don't have to do this a
> number of times:

It's left over cruft from an optimization that I realized would only
be an optimization if the cache was really slow.  I thought I had
eliminated it.  Just ignore it for now.  I'll delete it everywhere.

> There's a nice shiny comment next to 'lru'.  Hint, hint. ;)

Like I said, rushed for preview.

>
> > +int migrate_top_level_page_table(struct mm_struct *mm, struct page *dest)
> > +{
> > +       return 1;
> > +#if 0
> > +       unsigned long flags;
> > +       void *dest_ptr;
> > +
> > +       /* We can't do this until we get a heavy duty tlb flush, or
> > +          we can force this mm to be switched on all cpus. */
>
> Can you elaborate on this?  You need each cpu to do a task switch _away_
> from this mm?

Switching away is sufficient, but you can do a little better by just
reloading the appropriate registers from the mm.  For example on
X86_64, an mm flush is accomplished by the equivalent of mov cr3, cr3
(I think it's cr3).  We need a reload of cr3 from the mm struct.
Currently the only code that does that is the task switch code.

>
> > +int migrate_pmd(pmd_t *pmd, struct mm_struct *mm, unsigned long addr,
> > +               struct page *dest)
> > +{
> ...
> > +       pte = pte_offset_map(pmd, addr);
> > +
> > +       dest_ptr = kmap_atomic(dest, KM_IRQ0);
>
> Why KM_IRQ0 here?

Laziness.  I needed a mapping and irq0 is safe.  Something better
should be chosen before the code is ready to go in.

    Ross

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2007-11-30 18:32 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-30 16:36 RFC/POC Make Page Tables Relocatable Part 2 Page Table Migration Code Ross Biro
2007-11-30 18:04 ` Dave Hansen
2007-11-30 18:32   ` Ross Biro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).