All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] export NUMA allocation fragmentation
@ 2003-01-31  3:39 Dave Hansen
  0 siblings, 0 replies; only message in thread
From: Dave Hansen @ 2003-01-31  3:39 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-mm

[-- Attachment #1: Type: text/plain, Size: 1317 bytes --]

The NUMA memory allocation support attempts to allocate pages close to
the CPU that the allocating process is currently running on.  We have a
hard time determining how effective these strategies have been, or how
fragmented the allocations might get if a process is bounced around
between nodes.  This patch adds a new /proc/<pid> entry: nodepages.

It walks the process's vm_area_structs for all vaddr ranges, then
examines the ptes to determine on which node each virtual address
physically resides.

I'm a little worried about just taking the pte from __follow_page() and
dumping it into pte_pfn().  Is there something I should be testing for,
before I feed it along?

I've tested it on both NUMA and non-NUMA systems (see the pfn_to_nid()
changes).  The below are from a 4-quad 16-proc NUMAQ.

This is a process that allocates, then faults in a 256MB chunk of
memory, bound to CPU 4 (node 1).
curly:~# cat /proc/378/nodepages
Node 0 pages: 369
Node 1 pages: 65571
Node 2 pages: 0
Node 3 pages: 0

Here is the same thing, bound to CPU 12 (node 3); it was probably forked
on node 1 before it was bound.
Node 0 pages: 369
Node 1 pages: 2
Node 2 pages: 0
Node 3 pages: 65569

I would imagine that the pages on node 0 are from libc, which was
originally mapped on node 0.  The other processes inherit this.
-- 
Dave Hansen
haveblue@us.ibm.com

[-- Attachment #2: proc-pid-nodepages-2.5.59-mjb2-1.patch --]
[-- Type: text/plain, Size: 4759 bytes --]

diff -ru linux-2.5.59-mjb2-clean/fs/proc/base.c linux-2.5.59-mjb2-vma-stat/fs/proc/base.c
--- linux-2.5.59-mjb2-clean/fs/proc/base.c	Wed Jan 29 19:02:49 2003
+++ linux-2.5.59-mjb2-vma-stat/fs/proc/base.c	Thu Jan 30 17:57:51 2003
@@ -45,6 +45,7 @@
 enum pid_directory_inos {
 	PROC_PID_INO = 2,
 	PROC_PID_STATUS,
+	PROC_PID_NODE_PAGES,
 	PROC_PID_MEM,
 	PROC_PID_CWD,
 	PROC_PID_ROOT,
@@ -72,6 +73,7 @@
   E(PROC_PID_FD,	"fd",		S_IFDIR|S_IRUSR|S_IXUSR),
   E(PROC_PID_ENVIRON,	"environ",	S_IFREG|S_IRUSR),
   E(PROC_PID_STATUS,	"status",	S_IFREG|S_IRUGO),
+  E(PROC_PID_NODE_PAGES,"nodepages",	S_IFREG|S_IRUGO),
   E(PROC_PID_CMDLINE,	"cmdline",	S_IFREG|S_IRUGO),
   E(PROC_PID_STAT,	"stat",		S_IFREG|S_IRUGO),
   E(PROC_PID_STATM,	"statm",	S_IFREG|S_IRUGO),
@@ -102,6 +104,7 @@
 int proc_pid_status(struct task_struct*,char*);
 int proc_pid_statm(struct task_struct*,char*);
 int proc_pid_cpu(struct task_struct*,char*);
+int proc_pid_nodepages(struct task_struct*,char*);
 
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
@@ -1012,6 +1015,10 @@
 		case PROC_PID_STATUS:
 			inode->i_fop = &proc_info_file_operations;
 			ei->op.proc_read = proc_pid_status;
+			break;
+		case PROC_PID_NODE_PAGES:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_nodepages;
 			break;
 		case PROC_PID_STAT:
 			inode->i_fop = &proc_info_file_operations;
diff -ru linux-2.5.59-mjb2-clean/fs/proc/task_mmu.c linux-2.5.59-mjb2-vma-stat/fs/proc/task_mmu.c
--- linux-2.5.59-mjb2-clean/fs/proc/task_mmu.c	Wed Jan 29 19:02:49 2003
+++ linux-2.5.59-mjb2-vma-stat/fs/proc/task_mmu.c	Thu Jan 30 19:25:54 2003
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <asm/uaccess.h>
+#include <asm/mmzone.h>
 
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
@@ -243,5 +244,56 @@
 out_free1:
 	free_page((unsigned long)kbuf);
 out:
+	return retval;
+}
+
+extern pte_t
+__follow_page(struct mm_struct *mm, unsigned long address);
+
+ssize_t proc_pid_nodepages(struct task_struct *task, char* buf)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct * map;
+	long retval;
+	int nids[MAX_NR_NODES];
+	int i;
+
+	for(i=0;i<numnodes;i++)
+		nids[i] = 0;
+	
+	/*
+	 * We might sleep getting the page, so get it first.
+	 */
+	mm = get_task_mm(task);
+
+	if(!mm) {
+		printk("%s(): !mm !!\n", __FUNCTION__);
+		return 0;
+	}
+	
+	retval = 0;
+
+	down_read(&mm->mmap_sem);
+	map = mm->mmap;
+	while (map) {
+		unsigned long vaddr = map->vm_start;
+		unsigned long vm_end = map->vm_end;
+		pte_t pte;
+		unsigned long pfn;
+		
+		for(;vaddr < vm_end; vaddr += PAGE_SIZE) {
+			pte = __follow_page(mm, vaddr);
+			pfn = pte_pfn(pte);
+			nids[pfn_to_nid(pfn)]++;
+		}
+		map = map->vm_next;
+	}
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	for(i=0;i<numnodes;i++) {
+		retval += sprintf(&buf[retval], "Node %d pages: %d\n", 
+				i, nids[i]);
+	}
 	return retval;
 }
diff -ru linux-2.5.59-mjb2-clean/include/asm-i386/mmzone.h linux-2.5.59-mjb2-vma-stat/include/asm-i386/mmzone.h
--- linux-2.5.59-mjb2-clean/include/asm-i386/mmzone.h	Wed Jan 29 19:02:38 2003
+++ linux-2.5.59-mjb2-vma-stat/include/asm-i386/mmzone.h	Thu Jan 30 19:25:54 2003
@@ -8,14 +8,17 @@
 
 #include <asm/smp.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_DISCONTIGMEM
+
+#define pfn_to_nid(pfn)		(0)
+
+#else
 
 #ifdef CONFIG_X86_NUMAQ
 #include <asm/numaq.h>
 #elif CONFIG_X86_SUMMIT
 #include <asm/srat.h>
 #else
-#define pfn_to_nid(pfn)		(0)
 #endif /* CONFIG_X86_NUMAQ */
 
 extern struct pglist_data *node_data[];
diff -ru linux-2.5.59-mjb2-clean/mm/memory.c linux-2.5.59-mjb2-vma-stat/mm/memory.c
--- linux-2.5.59-mjb2-clean/mm/memory.c	Wed Jan 29 19:02:54 2003
+++ linux-2.5.59-mjb2-vma-stat/mm/memory.c	Thu Jan 30 16:45:55 2003
@@ -612,13 +612,12 @@
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-struct page *
-follow_page(struct mm_struct *mm, unsigned long address, int write) 
+pte_t 
+__follow_page(struct mm_struct *mm, unsigned long address)
 {
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
-	unsigned long pfn;
 
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || pgd_bad(*pgd))
@@ -629,11 +628,25 @@
 		goto out;
 
 	ptep = pte_offset_map(pmd, address);
-	if (!ptep)
+	if (!ptep) {
+		pte.pte_low = 0; //__bad_page();		
+		pte.pte_high = 0;
 		goto out;
-
+	}
 	pte = *ptep;
 	pte_unmap(ptep);
+
+out:
+	return pte;
+}
+	
+struct page *
+follow_page(struct mm_struct *mm, unsigned long address, int write) 
+{
+	pte_t pte;	
+	unsigned long pfn;
+	
+	pte = __follow_page(mm, address);
 	if (pte_present(pte)) {
 		if (!write || (pte_write(pte) && pte_dirty(pte))) {
 			pfn = pte_pfn(pte);
@@ -642,7 +655,6 @@
 		}
 	}
 
-out:
 	return NULL;
 }
 

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2003-01-31  3:39 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-01-31  3:39 [PATCH] export NUMA allocation fragmentation Dave Hansen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.