public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* removing mm->rss and mm->anon_rss from kernel?
       [not found]             ` <418C55A7.9030100@yahoo.com.au>
@ 2004-11-06  9:28               ` Christoph Lameter
       [not found]                 ` <204290000.1099754257@[10.10.2.4]>
                                   ` (7 more replies)
  0 siblings, 8 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-06  9:28 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

My page scalability patches need to make rss atomic and now with the
addition of anon_rss I would also have to make that atomic.

But when I looked at the code I found that the only significant use of
both is for /proc statistics. There are 3 other uses in mm/rmap.c where
the use of mm->rss may be replaced by mm->total_vm.

So I removed all uses of mm->rss and anon_rss from the kernel and
introduced a bean counter count_vm() that is only run when the
corresponding /proc file is used. count_vm then runs through the vm
and counts all the page types. This could also add additional page types to our
statistics and solve some of the consistency issues.

The patch is by no means perfect. If you think this is worth pursuing then
I will finish the support for other archs and deal with the locking
issues etc. This patch may also remove hot spot issues that may arise with
the use of these two variables and so is of interest to us.

But a kernel with this patch boots fine and the statistics in /proc look
still okay (its late though....)

Index: linux-2.6.9/kernel/fork.c
===================================================================
--- linux-2.6.9.orig/kernel/fork.c	2004-11-03 13:36:35.000000000 -0800
+++ linux-2.6.9/kernel/fork.c	2004-11-05 18:09:53.000000000 -0800
@@ -172,8 +172,6 @@
 	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->map_count = 0;
-	mm->rss = 0;
-	mm->anon_rss = 0;
 	cpus_clear(mm->cpu_vm_mask);
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
Index: linux-2.6.9/include/linux/sched.h
===================================================================
--- linux-2.6.9.orig/include/linux/sched.h	2004-11-03 13:36:35.000000000 -0800
+++ linux-2.6.9/include/linux/sched.h	2004-11-05 18:09:33.000000000 -0800
@@ -216,7 +216,7 @@
 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
 	int map_count;				/* number of VMAs */
 	struct rw_semaphore mmap_sem;
-	spinlock_t page_table_lock;		/* Protects page tables, mm->rss, mm->anon_rss */
+	spinlock_t page_table_lock;		/* Protects page tables */

 	struct list_head mmlist;		/* List of maybe swapped mm's.  These are globally strung
 						 * together off init_mm.mmlist, and are protected
@@ -226,7 +226,7 @@
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long start_brk, brk, start_stack;
 	unsigned long arg_start, arg_end, env_start, env_end;
-	unsigned long rss, anon_rss, total_vm, locked_vm, shared_vm;
+	unsigned long total_vm, locked_vm, shared_vm;
 	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;

 	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
Index: linux-2.6.9/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.9.orig/fs/proc/task_mmu.c	2004-11-03 13:36:34.000000000 -0800
+++ linux-2.6.9/fs/proc/task_mmu.c	2004-11-06 00:41:11.000000000 -0800
@@ -7,10 +7,13 @@
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data, text, lib;
+	struct vm_count c;
+

 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	count_vm(mm, &c);
 	buffer += sprintf(buffer,
 		"VmSize:\t%8lu kB\n"
 		"VmLck:\t%8lu kB\n"
@@ -22,7 +25,7 @@
 		"VmPTE:\t%8lu kB\n",
 		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		mm->rss << (PAGE_SHIFT-10),
+		c.resident << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -37,11 +40,14 @@
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = mm->rss - mm->anon_rss;
+	struct vm_count c;
+
+	count_vm(mm, &c);
+	*shared = c.shared;
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = mm->rss;
+	*resident = c.resident;
 	return mm->total_vm;
 }

Index: linux-2.6.9/mm/mmap.c
===================================================================
--- linux-2.6.9.orig/mm/mmap.c	2004-11-03 13:36:36.000000000 -0800
+++ linux-2.6.9/mm/mmap.c	2004-11-05 18:12:00.000000000 -0800
@@ -1850,7 +1850,6 @@
 	vma = mm->mmap;
 	mm->mmap = mm->mmap_cache = NULL;
 	mm->mm_rb = RB_ROOT;
-	mm->rss = 0;
 	mm->total_vm = 0;
 	mm->locked_vm = 0;

Index: linux-2.6.9/include/asm-generic/tlb.h
===================================================================
--- linux-2.6.9.orig/include/asm-generic/tlb.h	2004-10-18 14:53:05.000000000 -0700
+++ linux-2.6.9/include/asm-generic/tlb.h	2004-11-06 01:12:10.000000000 -0800
@@ -86,13 +86,6 @@
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	int freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	int rss = mm->rss;
-
-	if (rss < freed)
-		freed = rss;
-	mm->rss = rss - freed;
 	tlb_flush_mmu(tlb, start, end);

 	/* keep the page table cache within bounds */
Index: linux-2.6.9/fs/exec.c
===================================================================
--- linux-2.6.9.orig/fs/exec.c	2004-11-03 13:36:34.000000000 -0800
+++ linux-2.6.9/fs/exec.c	2004-11-05 18:19:42.000000000 -0800
@@ -320,7 +320,6 @@
 		pte_unmap(pte);
 		goto out;
 	}
-	mm->rss++;
 	lru_cache_add_active(page);
 	set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
Index: linux-2.6.9/fs/binfmt_flat.c
===================================================================
--- linux-2.6.9.orig/fs/binfmt_flat.c	2004-11-03 13:36:29.000000000 -0800
+++ linux-2.6.9/fs/binfmt_flat.c	2004-11-05 18:19:27.000000000 -0800
@@ -650,7 +650,6 @@
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-		current->mm->rss = 0;
 	}

 	if (flags & FLAT_FLAG_KTRACE)
Index: linux-2.6.9/mm/memory.c
===================================================================
--- linux-2.6.9.orig/mm/memory.c	2004-11-03 13:36:36.000000000 -0800
+++ linux-2.6.9/mm/memory.c	2004-11-06 01:10:19.000000000 -0800
@@ -333,9 +333,6 @@
 					pte = pte_mkclean(pte);
 				pte = pte_mkold(pte);
 				get_page(page);
-				dst->rss++;
-				if (PageAnon(page))
-					dst->anon_rss++;
 				set_pte(dst_pte, pte);
 				page_dup_rmap(page);
 cont_copy_pte_range_noset:
@@ -426,8 +423,6 @@
 				set_pte(ptep, pgoff_to_pte(page->index));
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			if (PageAnon(page))
-				tlb->mm->anon_rss--;
 			else if (pte_young(pte))
 				mark_page_accessed(page);
 			tlb->freed++;
@@ -1113,11 +1108,7 @@
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, pte))) {
-		if (PageAnon(old_page))
-			mm->anon_rss--;
-		if (PageReserved(old_page))
-			++mm->rss;
-		else
+		if (!PageReserved(old_page))
 			page_remove_rmap(old_page);
 		break_cow(vma, new_page, address, page_table);
 		lru_cache_add_active(new_page);
@@ -1398,7 +1389,6 @@
 	if (vm_swap_full())
 		remove_exclusive_swap_page(page);

-	mm->rss++;
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1463,7 +1453,6 @@
 			spin_unlock(&mm->page_table_lock);
 			goto out;
 		}
-		mm->rss++;
 		entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
 							 vma->vm_page_prot)),
 				      vma);
@@ -1571,8 +1560,6 @@
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
-		if (!PageReserved(new_page))
-			++mm->rss;
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
@@ -1851,3 +1838,108 @@
 }

 #endif
+
+static void count_pte_range(pmd_t *pmd,
+	unsigned long address, unsigned long size,
+	struct vm_count *c)
+{
+        unsigned long offset;
+        pte_t *ptep;
+
+        if (pmd_none(*pmd))
+                return;
+        if (unlikely(pmd_bad(*pmd)))
+                return;
+        ptep = pte_offset_map(pmd, address);
+        offset = address & ~PMD_MASK;
+        if (offset + size > PMD_SIZE)
+                size = PMD_SIZE - offset;
+        size &= PAGE_MASK;
+        for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
+                pte_t pte = *ptep;
+
+		if (pte_none(pte))
+                        continue;
+
+                if (pte_present(pte)) {
+                        struct page *page = NULL;
+                        unsigned long pfn = pte_pfn(pte);
+
+			c->present++;
+                        if (pfn_valid(pfn)) {
+                                page = pfn_to_page(pfn);
+                                if (PageReserved(page))
+                                        c->reserved++;
+				else
+				if (page_mapped(page) > 1)
+					c->shared++;
+				else
+				if (page_mapped(page) == 1)
+					c->resident++;
+
+				if (PageLocked(page))
+					c->locked++;
+                        }
+                        if (unlikely(!page))
+                                continue;
+                        if (pte_dirty(pte))
+                                c->dirty++;
+                        else if (pte_young(pte))
+                                c->young++;
+                } else {
+	                if (pte_file(pte))
+				c->file++;
+			else
+				c->swap++;
+		}
+        }
+}
+
+static void count_pmd_range(pgd_t *dir,
+	unsigned long address, unsigned long end, struct vm_count *c)
+{
+	pmd_t * pmd;
+
+        if (pgd_none(*dir))
+                return;
+        if (unlikely(pgd_bad(*dir)))
+                return;
+
+	pmd = pmd_offset(dir, address);
+
+	if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
+                end = ((address + PGDIR_SIZE) & PGDIR_MASK);
+        do {
+                count_pte_range(pmd, address, end - address, c);
+                address = (address + PMD_SIZE) & PMD_MASK;
+                pmd++;
+        } while (address && address < end);
+}
+
+static void count_vma(struct vm_area_struct *vma, struct vm_count *c)
+{
+	unsigned long address = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	pgd_t * dir = pgd_offset(vma->vm_mm, address);
+
+	do {
+		count_pmd_range(dir, address, end, c);
+		address = (address + PGDIR_SIZE) & PGDIR_MASK;
+		dir++;
+	} while (address && address < end);
+}
+
+void count_vm(struct mm_struct *mm, struct vm_count *c)
+{
+	struct vm_area_struct *vma;
+
+	memset(c, 0,sizeof(struct vm_count));
+
+	for(vma = mm->mmap; vma; vma = vma->vm_next)
+		if (is_vm_hugetlb_page(vma)) {
+			printk(KERN_WARNING "hugetlb scans not supported.\n");
+		} else
+			count_vma(vma, c);
+}
+
+
Index: linux-2.6.9/fs/binfmt_som.c
===================================================================
--- linux-2.6.9.orig/fs/binfmt_som.c	2004-10-18 14:53:51.000000000 -0700
+++ linux-2.6.9/fs/binfmt_som.c	2004-11-05 18:19:54.000000000 -0800
@@ -259,7 +259,6 @@
 	create_som_tables(bprm);

 	current->mm->start_stack = bprm->p;
-	current->mm->rss = 0;

 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
Index: linux-2.6.9/mm/fremap.c
===================================================================
--- linux-2.6.9.orig/mm/fremap.c	2004-11-03 13:36:30.000000000 -0800
+++ linux-2.6.9/mm/fremap.c	2004-11-05 18:11:46.000000000 -0800
@@ -39,7 +39,6 @@
 					set_page_dirty(page);
 				page_remove_rmap(page);
 				page_cache_release(page);
-				mm->rss--;
 			}
 		}
 	} else {
@@ -87,7 +86,6 @@

 	zap_pte(mm, vma, addr, pte);

-	mm->rss++;
 	flush_icache_page(vma, page);
 	set_pte(pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
Index: linux-2.6.9/mm/swapfile.c
===================================================================
--- linux-2.6.9.orig/mm/swapfile.c	2004-11-03 13:36:36.000000000 -0800
+++ linux-2.6.9/mm/swapfile.c	2004-11-05 18:13:56.000000000 -0800
@@ -431,7 +431,6 @@
 unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
 	swp_entry_t entry, struct page *page)
 {
-	vma->vm_mm->rss++;
 	get_page(page);
 	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 	page_add_anon_rmap(page, vma, address);
Index: linux-2.6.9/fs/binfmt_aout.c
===================================================================
--- linux-2.6.9.orig/fs/binfmt_aout.c	2004-11-03 13:36:29.000000000 -0800
+++ linux-2.6.9/fs/binfmt_aout.c	2004-11-05 18:19:16.000000000 -0800
@@ -309,7 +309,6 @@
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 	current->mm->free_area_cache = current->mm->mmap_base;

-	current->mm->rss = 0;
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
Index: linux-2.6.9/arch/ia64/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9.orig/arch/ia64/mm/hugetlbpage.c	2004-10-18 14:54:27.000000000 -0700
+++ linux-2.6.9/arch/ia64/mm/hugetlbpage.c	2004-11-05 18:17:34.000000000 -0800
@@ -65,7 +65,6 @@
 {
 	pte_t entry;

-	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
 	if (write_access) {
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
@@ -108,7 +107,6 @@
 		ptepage = pte_page(entry);
 		get_page(ptepage);
 		set_pte(dst_pte, entry);
-		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
 		addr += HPAGE_SIZE;
 	}
 	return 0;
@@ -249,7 +247,6 @@
 		put_page(page);
 		pte_clear(pte);
 	}
-	mm->rss -= (end - start) >> PAGE_SHIFT;
 	flush_tlb_range(vma, start, end);
 }

Index: linux-2.6.9/fs/proc/array.c
===================================================================
--- linux-2.6.9.orig/fs/proc/array.c	2004-11-03 13:36:29.000000000 -0800
+++ linux-2.6.9/fs/proc/array.c	2004-11-06 00:43:28.000000000 -0800
@@ -317,6 +317,7 @@
 	unsigned long rsslim = 0;
 	struct task_struct *t;
 	char tcomm[sizeof(task->comm)];
+	struct vm_count c;

 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -394,6 +395,9 @@
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);

+	if (mm)
+		count_vm(mm, &c);
+
 	res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
 %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
@@ -420,7 +424,7 @@
 		jiffies_to_clock_t(task->it_real_value),
 		start_time,
 		vsize,
-		mm ? mm->rss : 0, /* you might want to shift this left 3 */
+		mm ? c.resident : 0, /* you might want to shift this left 3 */
 	        rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
Index: linux-2.6.9/arch/i386/mm/hugetlbpage.c
===================================================================
--- linux-2.6.9.orig/arch/i386/mm/hugetlbpage.c	2004-11-03 13:36:31.000000000 -0800
+++ linux-2.6.9/arch/i386/mm/hugetlbpage.c	2004-11-05 18:18:05.000000000 -0800
@@ -42,7 +42,6 @@
 {
 	pte_t entry;

-	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
 	if (write_access) {
 		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
@@ -82,7 +81,6 @@
 		ptepage = pte_page(entry);
 		get_page(ptepage);
 		set_pte(dst_pte, entry);
-		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
 		addr += HPAGE_SIZE;
 	}
 	return 0;
@@ -218,7 +216,6 @@
 		page = pte_page(pte);
 		put_page(page);
 	}
-	mm->rss -= (end - start) >> PAGE_SHIFT;
 	flush_tlb_range(vma, start, end);
 }

Index: linux-2.6.9/fs/binfmt_elf.c
===================================================================
--- linux-2.6.9.orig/fs/binfmt_elf.c	2004-11-03 13:36:29.000000000 -0800
+++ linux-2.6.9/fs/binfmt_elf.c	2004-11-05 18:18:53.000000000 -0800
@@ -716,7 +716,6 @@

 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	current->mm->rss = 0;
 	current->mm->free_area_cache = current->mm->mmap_base;
 	retval = setup_arg_pages(bprm, executable_stack);
 	if (retval < 0) {
Index: linux-2.6.9/include/asm-ia64/tlb.h
===================================================================
--- linux-2.6.9.orig/include/asm-ia64/tlb.h	2004-10-18 14:53:51.000000000 -0700
+++ linux-2.6.9/include/asm-ia64/tlb.h	2004-11-06 00:33:14.000000000 -0800
@@ -159,13 +159,6 @@
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	unsigned long freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	unsigned long rss = mm->rss;
-
-	if (rss < freed)
-		freed = rss;
-	mm->rss = rss - freed;
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
Index: linux-2.6.9/include/linux/mm.h
===================================================================
--- linux-2.6.9.orig/include/linux/mm.h	2004-11-03 13:36:35.000000000 -0800
+++ linux-2.6.9/include/linux/mm.h	2004-11-06 00:28:06.000000000 -0800
@@ -792,6 +792,22 @@
 							-vma_pages(vma));
 }

+/* Statistics on pages used in a vm */
+
+struct vm_count {
+	unsigned long shared;
+	unsigned long resident;
+	unsigned long swap;
+	unsigned long file;
+	unsigned long present;
+	unsigned long reserved;
+	unsigned long dirty;
+	unsigned long locked;
+	unsigned long young;
+};
+
+extern void count_vm(struct mm_struct *mm, struct vm_count *c);
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
       [not found]                 ` <204290000.1099754257@[10.10.2.4]>
@ 2004-11-06 16:19                   ` Christoph Lameter
  2004-11-06 20:05                     ` William Lee Irwin III
  2004-11-07 16:11                     ` Martin J. Bligh
  0 siblings, 2 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-06 16:19 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Nick Piggin, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

On Sat, 6 Nov 2004, Martin J. Bligh wrote:

> > So I removed all uses of mm->rss and anon_rss from the kernel and
> > introduced a bean counter count_vm() that is only run when the
> > corresponding /proc file is used. count_vm then runs through the vm
> > and counts all the page types. This could also add additional page types to our
> > statistics and solve some of the consistency issues.
>
> I would've thought SGI would be more worried about this kind of thing
> than anyone else ... what's going to happen when you type 'ps' on a large
> box, and it does this for 10,000 processes?

Yes but I think this is preferable because of the generally faster
operations of the vm without having to continually update statistics. And
these statistics seem to be quite difficult to properly generate (why else
introduce anon_rss). Without the counters other optimizations are easier
to do.

Doing a ps is not a frequent event. Of course this may cause
significant load if one does regularly access /proc entities then. Are
there any threads from the past with some numbers of what the impact was
when we calculated rss via proc?

> If you want to make it quicker, how about doing per-cpu stats, and totalling
> them at runtime, which'd be lockless, instead of all the atomic ops?

That has its own complications and would require lots of memory with
systems that already have up to 10k cpus.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06 16:19                   ` Christoph Lameter
@ 2004-11-06 20:05                     ` William Lee Irwin III
  2004-11-07 16:11                     ` Martin J. Bligh
  1 sibling, 0 replies; 33+ messages in thread
From: William Lee Irwin III @ 2004-11-06 20:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

On Sat, Nov 06, 2004 at 08:19:55AM -0800, Christoph Lameter wrote:
> Yes but I think this is preferable because of the generally faster
> operations of the vm without having to continually update statistics. And
> these statistics seem to be quite difficult to properly generate (why else
> introduce anon_rss). Without the counters other optimizations are easier
> to do.
> Doing a ps is not a frequent event. Of course this may cause
> significant load if one does regularly access /proc entities then. Are
> there any threads from the past with some numbers of what the impact was
> when we calculated rss via proc?

It was catastrophic. Failure of monitoring tools to make forward
progress, long-lived delays of "victim" processes whose locks were held
by /proc/ observers, and the like.


On Sat, Nov 06, 2004 at 08:19:55AM -0800, Christoph Lameter wrote:
> That has its own complications and would require lots of memory with
> systems that already have up to 10k cpus.

Split counters are a solved problem, even for the 10K cpus case.


-- wli

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
       [not found]                 ` <204290000.1099754257@[10.10.2.4]>
@ 2004-11-06 20:51                 ` Rik van Riel
  2004-11-08 15:47                 ` Russ Anderson
                                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 33+ messages in thread
From: Rik van Riel @ 2004-11-06 20:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Martin J. Bligh, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

On Sat, 6 Nov 2004, Christoph Lameter wrote:

> Doing a ps is not a frequent event. Of course this may cause
> significant load if one does regularly access /proc entities then. Are
> there any threads from the past with some numbers of what the impact was
> when we calculated rss via proc?

Running top(1) on stock 2.4 kernels pretty much kills large
systems from SGI and IBM.  Think about walking the VM for
10,000 processes, with 3GB virtual memory each, every 3
seconds.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06 16:19                   ` Christoph Lameter
  2004-11-06 20:05                     ` William Lee Irwin III
@ 2004-11-07 16:11                     ` Martin J. Bligh
  2004-11-07 18:25                       ` Matthew Wilcox
  2004-11-08 16:04                       ` Christoph Lameter
  1 sibling, 2 replies; 33+ messages in thread
From: Martin J. Bligh @ 2004-11-07 16:11 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

>> I would've thought SGI would be more worried about this kind of thing
>> than anyone else ... what's going to happen when you type 'ps' on a large
>> box, and it does this for 10,000 processes?
> 
> Yes but I think this is preferable because of the generally faster
> operations of the vm without having to continually update statistics. And
> these statistics seem to be quite difficult to properly generate (why else
> introduce anon_rss). Without the counters other optimizations are easier
> to do.
> 
> Doing a ps is not a frequent event. Of course this may cause
> significant load if one does regularly access /proc entities then. Are
> there any threads from the past with some numbers of what the impact was
> when we calculated rss via proc?

Doing ps or top is not unusual at all, and the sysadmins should be able
to monitor their system in a reasonable way without crippling it, or even
affecting it significantly.
 
>> If you want to make it quicker, how about doing per-cpu stats, and totalling
>> them at runtime, which'd be lockless, instead of all the atomic ops?
> 
> That has its own complications and would require lots of memory with
> systems that already have up to 10k cpus.

Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
getting rather silly ....

M.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-07 16:11                     ` Martin J. Bligh
@ 2004-11-07 18:25                       ` Matthew Wilcox
  2004-11-08 12:43                         ` Jesse Barnes
  2004-11-08 15:26                         ` Andi Kleen
  2004-11-08 16:04                       ` Christoph Lameter
  1 sibling, 2 replies; 33+ messages in thread
From: Matthew Wilcox @ 2004-11-07 18:25 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Christoph Lameter, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

On Sun, Nov 07, 2004 at 08:11:24AM -0800, Martin J. Bligh wrote:
> Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
> getting rather silly ....

NASA bought a 10k CPU system, but that's a cluster.  I think the largest
single system within that cluster is 256 CPUs.

-- 
"Next the statesmen will invent cheap lies, putting the blame upon 
the nation that is attacked, and every man will be glad of those
conscience-soothing falsities, and will diligently study them, and refuse
to examine any refutations of them; and thus he will by and by convince 
himself that the war is just, and will thank God for the better sleep 
he enjoys after this process of grotesque self-deception." -- Mark Twain

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-07 18:25                       ` Matthew Wilcox
@ 2004-11-08 12:43                         ` Jesse Barnes
  2004-11-08 15:26                         ` Andi Kleen
  1 sibling, 0 replies; 33+ messages in thread
From: Jesse Barnes @ 2004-11-08 12:43 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Martin J. Bligh, Christoph Lameter, Nick Piggin,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

On Sunday, November 07, 2004 10:25 am, Matthew Wilcox wrote:
> On Sun, Nov 07, 2004 at 08:11:24AM -0800, Martin J. Bligh wrote:
> > Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
> > getting rather silly ....
>
> NASA bought a 10k CPU system, but that's a cluster.  I think the largest
> single system within that cluster is 256 CPUs.

512 I believe.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-07 18:25                       ` Matthew Wilcox
  2004-11-08 12:43                         ` Jesse Barnes
@ 2004-11-08 15:26                         ` Andi Kleen
  2004-11-08 16:07                           ` Christoph Lameter
  1 sibling, 1 reply; 33+ messages in thread
From: Andi Kleen @ 2004-11-08 15:26 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Martin J. Bligh, Christoph Lameter, Nick Piggin,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

On Sun, Nov 07, 2004 at 06:25:54PM +0000, Matthew Wilcox wrote:
> On Sun, Nov 07, 2004 at 08:11:24AM -0800, Martin J. Bligh wrote:
> > Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
> > getting rather silly ....
> 
> NASA bought a 10k CPU system, but that's a cluster.  I think the largest
> single system within that cluster is 256 CPUs.

512 CPUs AFAIK.

-Andi

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
       [not found]                 ` <204290000.1099754257@[10.10.2.4]>
  2004-11-06 20:51                 ` Rik van Riel
@ 2004-11-08 15:47                 ` Russ Anderson
  2004-11-08 16:08                   ` Martin J. Bligh
  2004-11-10  4:52                   ` Nick Piggin
  2004-11-08 16:35                 ` Rik van Riel
                                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 33+ messages in thread
From: Russ Anderson @ 2004-11-08 15:47 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Martin J. Bligh, Christoph Lameter, Nick Piggin,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

Matthew Wilcox wrote:
> 
> On Sun, Nov 07, 2004 at 08:11:24AM -0800, Martin J. Bligh wrote:
> > Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
> > getting rather silly ....
> 
> NASA bought a 10k CPU system, but that's a cluster.  I think the largest
> single system within that cluster is 256 CPUs.

Each "node" is a single linux kernel with 512 processors..
There are 20 nodes in the cluster.  20 x 512p = 10,240 processors.

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-07 16:11                     ` Martin J. Bligh
  2004-11-07 18:25                       ` Matthew Wilcox
@ 2004-11-08 16:04                       ` Christoph Lameter
  2004-11-08 16:12                         ` Anton Blanchard
                                           ` (3 more replies)
  1 sibling, 4 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-08 16:04 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Nick Piggin, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

On Sun, 7 Nov 2004, Martin J. Bligh wrote:

> Doing ps or top is not unusual at all, and the sysadmins should be able
> to monitor their system in a reasonable way without crippling it, or even
> affecting it significantly.

Hmm.. What would you think about a pointer to a stats structure in mm,
which would only be allocated if stats are requested by /proc actions? The
struct would contain a timestamp which would insure that the stats are
only generated in certain intervals and not over and over again. This
would also make it possible to force a regeneration of the numbers.

Maybe lots of other statistical values in mm_struct could then also be
removed?

> Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
> getting rather silly ....

Nope. The future of computing seems to be very high numbers of cpus.
NASA's Columbia has 10k cpus and the new BlueGene solution from IBM is
already at 8k.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 15:26                         ` Andi Kleen
@ 2004-11-08 16:07                           ` Christoph Lameter
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-08 16:07 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Matthew Wilcox, Martin J. Bligh, Nick Piggin,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

On Mon, 8 Nov 2004, Andi Kleen wrote:

> > NASA bought a 10k CPU system, but that's a cluster.  I think the largest
> > single system within that cluster is 256 CPUs.
>
> 512 CPUs AFAIK.

We surely would want to go much higher on that....

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 15:47                 ` Russ Anderson
@ 2004-11-08 16:08                   ` Martin J. Bligh
  2004-11-10  4:52                   ` Nick Piggin
  1 sibling, 0 replies; 33+ messages in thread
From: Martin J. Bligh @ 2004-11-08 16:08 UTC (permalink / raw)
  To: Russ Anderson, Matthew Wilcox
  Cc: Christoph Lameter, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

>> On Sun, Nov 07, 2004 at 08:11:24AM -0800, Martin J. Bligh wrote:
>> > Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
>> > getting rather silly ....
>> 
>> NASA bought a 10k CPU system, but that's a cluster.  I think the largest
>> single system within that cluster is 256 CPUs.
> 
> Each "node" is a single linux kernel with 512 processors..
> There are 20 nodes in the cluster.  20 x 512p = 10,240 processors.

Fair enough, but for the purposes if this discussion, irrelevant. It's
only each node that counts.

M.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:04                       ` Christoph Lameter
@ 2004-11-08 16:12                         ` Anton Blanchard
  2004-11-08 16:14                         ` Martin J. Bligh
                                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 33+ messages in thread
From: Anton Blanchard @ 2004-11-08 16:12 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Martin J. Bligh, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

 
> Nope. The future of computing seems to be very high numbers of cpus.
> NASAs Columbia has 10k cpus and the new BlueGen solution from IBM is
> already at 8k.

Bluegene isnt a fair comparison, its a cluster (and its much more than
8k cpus).

Anton

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:04                       ` Christoph Lameter
  2004-11-08 16:12                         ` Anton Blanchard
@ 2004-11-08 16:14                         ` Martin J. Bligh
  2004-11-08 16:25                           ` Christoph Lameter
  2004-11-08 16:30                         ` Erich Focht
  2004-11-10 11:57                         ` Magnus Damm
  3 siblings, 1 reply; 33+ messages in thread
From: Martin J. Bligh @ 2004-11-08 16:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nick Piggin, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

--Christoph Lameter <clameter@sgi.com> wrote (on Monday, November 08, 2004 08:04:05 -0800):

> On Sun, 7 Nov 2004, Martin J. Bligh wrote:
> 
>> Doing ps or top is not unusual at all, and the sysadmins should be able
>> to monitor their system in a reasonable way without crippling it, or even
>> effecting it significantly.
> 
> Hmm.. What would you think about a pointer to a stats structure in mm,
> which would only be allocated if stats are requested by /proc actions? The
> struct would contain a timestamp which would insure that the stats are
> only generated in certain intervals and not over and over again. This
> would also make it possible to force a regeneration of the numbers.
> 
> Maybe lots of other statistical values in mm_struct could then also be
> removed?

So basically it's the same thing except you're caching it. If you want 
stale old data, you can cache it in userspace, rather than hack the
kernel ... personally, I think it's utterly pointless - if the user didn't
want the data, then they wouldn't be requesting it.
 
>> Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
>> getting rather silly ....
> 
> Nope. The future of computing seems to be very high numbers of cpus.
> NASAs Columbia has 10k cpus and the new BlueGen solution from IBM is
> already at 8k.

I see where the figure comes from, but each node comes with its own memory,
so it's rather pointless to talk about total over the number of nodes. It'd
make as much sense to tot up the total number of processors on all Linux
installs in the world, and use that figure ;-)

What exactly was the problem? Far too many atomic ops flying around on a
512 CPU machine? Can't you just say that if the mm count is only 1, then 
don't bother with atomic? That'd fix all non-multi-threaded workloads,
I'd think.

M.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:14                         ` Martin J. Bligh
@ 2004-11-08 16:25                           ` Christoph Lameter
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-08 16:25 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Nick Piggin, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

On Mon, 8 Nov 2004, Martin J. Bligh wrote:

> > Hmm.. What would you think about a pointer to a stats structure in mm,
> > which would only be allocated if stats are requested by /proc actions? The
> > struct would contain a timestamp which would insure that the stats are
> > only generated in certain intervals and not over and over again. This
> > would also make it possible to force a regeneration of the numbers.
> >
> > Maybe lots of other statistical values in mm_struct could then also be
> > removed?
>
> So basically it's the same thing except you're caching it. If you want
> stale old data, you can cache it in userspace, rather than hack the
> kernel ... personally, I think it's utterly pointless - if the user didn't
> want the data, then they wouldn't be requesting it.

Then we would need userspace caching daemon for /proc?

The caching makes sense since multiple processes may be interested in the
same data and various views of the data exist (statm vs. status vs. cpu
node information f.e.)

Removing realtime statistics would remove lots of code from the vm.
Maintaining these counters requires locking which interferes with Nick's
and my attempts to parallelize the vm.

Also these counters may develop to be hot spots on a NUMA system. If the
statistics are updated once in a while by a vm scan running on a single
cpu then the issues arising with multiple cpu updating the same memory
locations are less severe.

Thus removing the realtime statistics may result in a
significant performance increase and may also lead to code that is easier
to be maintained since its shorter, simpler and may use less locking.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:04                       ` Christoph Lameter
  2004-11-08 16:12                         ` Anton Blanchard
  2004-11-08 16:14                         ` Martin J. Bligh
@ 2004-11-08 16:30                         ` Erich Focht
  2004-11-08 16:57                           ` Diego Calleja
  2004-11-10 11:57                         ` Magnus Damm
  3 siblings, 1 reply; 33+ messages in thread
From: Erich Focht @ 2004-11-08 16:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Martin J. Bligh, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

On Monday 08 November 2004 17:04, Christoph Lameter wrote:
> Nope. The future of computing seems to be very high numbers of cpus.
> NASAs Columbia has 10k cpus and the new BlueGen solution from IBM is
> already at 8k.

You're talking about clusters, i.e. multiple running instances of the
operating system. I don't think anybody really wants to go far beyond
512 nowadays. Application-wise 512 cpus/node isn't really needed (but
sometimes nice to have, for marketing). Beyond problems with
scalability of the interconnect (and very uneven latency distribution)
bigger systems would accumulate a too small MTBF. When a broken CPU,
DIMM or other chip takes your entire >1k CPU-machine down, you'll
happily exchange it against a cluster.

Erich


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
                                   ` (2 preceding siblings ...)
  2004-11-08 15:47                 ` Russ Anderson
@ 2004-11-08 16:35                 ` Rik van Riel
  2004-11-08 16:56                 ` Hugh Dickins
                                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 33+ messages in thread
From: Rik van Riel @ 2004-11-08 16:35 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Christoph Lameter, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

On Sun, 7 Nov 2004, Martin J. Bligh wrote:

> Doing ps or top is not unusual at all, and the sysadmins should be able
> to monitor their system in a reasonable way without crippling it, or
> even effecting it significantly.

I don't think there is a single system out there where
people throw performance monitoring out the window, in
the name of performance.

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
                                   ` (3 preceding siblings ...)
  2004-11-08 16:35                 ` Rik van Riel
@ 2004-11-08 16:56                 ` Hugh Dickins
  2004-11-08 17:01                   ` Christoph Lameter
  2004-11-12  0:38                 ` Christoph Lameter
                                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 33+ messages in thread
From: Hugh Dickins @ 2004-11-08 16:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Martin J. Bligh, Nick Piggin, Benjamin Herrenschmidt, linux-mm,
	linux-ia64

On Mon, 8 Nov 2004, Christoph Lameter wrote:
> 
> Removing realtime statistics would remove lots of code from the vm.

Remove lots of code?  Adding lots nastier.

> Maintaining these counters requires locking which interferes with Nick's
> and my attempts to parallelize the vm.

Aren't you rather overestimating the importance of one single,
ideally atomic, increment per page fault?

It's great news if this is really the major scalability issue facing Linux.

Hugh


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:30                         ` Erich Focht
@ 2004-11-08 16:57                           ` Diego Calleja
  2004-11-08 17:26                             ` Erich Focht
  0 siblings, 1 reply; 33+ messages in thread
From: Diego Calleja @ 2004-11-08 16:57 UTC (permalink / raw)
  To: Erich Focht
  Cc: clameter, mbligh, nickpiggin, benh, hugh, linux-mm, linux-ia64

El Mon, 8 Nov 2004 17:30:37 +0100 Erich Focht <efocht@hpce.nec.com> escribió:

> You're talking about clusters, i.e. multiple running instances of the
> operating system. I don't think anybody really wants to go far beyond
> 512 nowadays. Application-wise 512 cpus/node isn't really needed (but

<the newspaper guy>

SGI is already building one of 1024 CPUs according to some sources:
http://www.computerworld.com/hardwaretopics/hardware/story/0,10801,94564,00.html

but...

"Initially, Pennington said, the system will use two images of Linux -- one
per 512 processors -- while it's being tested and configured. Later, all 1,024
processors will address one image of the SGI Advanced Linux operating system
being used."

Also here ->
http://www.sgi.com/company_info/newsroom/press_releases/2004/november/jaeri.html
it talks about another supercomputer of 2048 CPUs, but I don't find clear
if it's a cluster, or several images. 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:56                 ` Hugh Dickins
@ 2004-11-08 17:01                   ` Christoph Lameter
  2004-11-09  2:40                     ` Nick Piggin
  0 siblings, 1 reply; 33+ messages in thread
From: Christoph Lameter @ 2004-11-08 17:01 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Martin J. Bligh, Nick Piggin, Benjamin Herrenschmidt, linux-mm,
	linux-ia64

On Mon, 8 Nov 2004, Hugh Dickins wrote:

> > Maintaining these counters requires locking which interferes with Nick's
> > and my attempts to parallelize the vm.
>
> Aren't you rather overestimating the importance of one single,
> ideally atomic, increment per page fault?

We would need to investigate that in detail. What we know is that if
multiple cpus do atomic increments with an additional spinlock/unlock etc
as done today then we do have a significant performance impact due to
exclusive cache lines oscillating between cpus.

> It's great news if this is really the major scalability issue facing Linux.

Not sure. This may just be a part of it.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:57                           ` Diego Calleja
@ 2004-11-08 17:26                             ` Erich Focht
  0 siblings, 0 replies; 33+ messages in thread
From: Erich Focht @ 2004-11-08 17:26 UTC (permalink / raw)
  To: Diego Calleja
  Cc: clameter, mbligh, nickpiggin, benh, hugh, linux-mm, linux-ia64

On Monday 08 November 2004 17:57, Diego Calleja wrote:
> El Mon, 8 Nov 2004 17:30:37 +0100 Erich Focht <efocht@hpce.nec.com> escribió:
> 
> > You're talking about clusters, i.e. multiple running instances of the
> > operating system. I don't think anybody really wants to go far beyond
> > 512 nowadays. Application-wise 512 cpus/node isn't really needed (but
> 
> <the newspaper guy>
> 
> SGI is already building one of 1024 CPUs according to some sources:
> http://www.computerworld.com/hardwaretopics/hardware/story/0,10801,94564,00.html
> 
> but...
> 
> "Initially, Pennington said, the system will use two images of Linux -- one
> per 512 processors -- while it's being tested and configured. Later, all 1,024
> processors will address one image of the SGI Advanced Linux operating system
> being used."

1k is not really "far beyond" 512. I'm sure it's doable, but I doubt
that this (or bigger machines) will spread too much. The progress in
cluster interconnect technology and software is just too fast. Think
of price/performance and stability (MTBF accumulation) and judge
yourself. Sure, if Linux could survive breaking hardware, the story
might change.

> Also here ->
> http://www.sgi.com/company_info/newsroom/press_releases/2004/november/jaeri.html
> it talks about another supercomputer of 2048 CPUs, but I don't find clear
> if it's a cluster, or several images. 

That was advertised to be a fraction of the Columbia machine, so a
cluster of big machines.

Erich


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 17:01                   ` Christoph Lameter
@ 2004-11-09  2:40                     ` Nick Piggin
  2004-11-09 12:10                       ` Matthew Wilcox
  0 siblings, 1 reply; 33+ messages in thread
From: Nick Piggin @ 2004-11-09  2:40 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Hugh Dickins, Martin J. Bligh, Benjamin Herrenschmidt, linux-mm,
	linux-ia64

Christoph Lameter wrote:
> On Mon, 8 Nov 2004, Hugh Dickins wrote:
> 
> 
>>>Maintaining these counters requires locking which interferes with Nick's
>>>and my attempts to parallelize the vm.
>>
>>Aren't you rather overestimating the importance of one single,
>>ideally atomic, increment per page fault?
> 
> 
> We would need to investigate that in detail. What we know is that if
> multiple cpus do atomic increments with an additional spinlock/unlock etc
> as done today then we do have a significant performance impact due to
> exclusive cache lines oscillating between cpus.
> 
> 
>>It's great news if this is really the major scalability issue facing Linux.
> 
> 
> Not sure. This may just be a part of it.
> 

I'm sure it would be a part of it. I think we've basically got 3 things
that share cachelines now, they are the mmap_sem, page_table_lock, and
rss/anon_rss.

After removing the page table lock, it tentatively looks like mmap_sem
is the next largest problem. It may be that the mmap_sem cacheline kind
of serialises threads coming into handle_mm_fault, so the rss doesn't
bounce so much. However I might just try ripping out the rss counters
entirely and just see what happens to performance.


I wonder if a per process flag or something could be used to turn off
the statistics counters? I guess statistics could still be gathered for
that process by using your lazy counting functions, Christoph.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-09  2:40                     ` Nick Piggin
@ 2004-11-09 12:10                       ` Matthew Wilcox
  2004-11-10  0:48                         ` Nick Piggin
  0 siblings, 1 reply; 33+ messages in thread
From: Matthew Wilcox @ 2004-11-09 12:10 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Christoph Lameter, Hugh Dickins, Martin J. Bligh,
	Benjamin Herrenschmidt, linux-mm, linux-ia64

On Tue, Nov 09, 2004 at 01:40:20PM +1100, Nick Piggin wrote:
> I wonder if a per process flag or something could be used to turn off
> the statistics counters? I guess statistics could still be gathered for
> that process by using your lazy counting functions, Christoph.

I don't get it.  It seems to me that any process that's going to
experience problems with these statistics counters is going to be
precisely the one that you want the statistics for!  What was the problem
with per-cpu counters again?

-- 
"Next the statesmen will invent cheap lies, putting the blame upon 
the nation that is attacked, and every man will be glad of those
conscience-soothing falsities, and will diligently study them, and refuse
to examine any refutations of them; and thus he will by and by convince 
himself that the war is just, and will thank God for the better sleep 
he enjoys after this process of grotesque self-deception." -- Mark Twain

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-09 12:10                       ` Matthew Wilcox
@ 2004-11-10  0:48                         ` Nick Piggin
  0 siblings, 0 replies; 33+ messages in thread
From: Nick Piggin @ 2004-11-10  0:48 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christoph Lameter, Hugh Dickins, Martin J. Bligh,
	Benjamin Herrenschmidt, linux-mm, linux-ia64

Matthew Wilcox wrote:
> On Tue, Nov 09, 2004 at 01:40:20PM +1100, Nick Piggin wrote:
> 
>>I wonder if a per process flag or something could be used to turn off
>>the statistics counters? I guess statistics could still be gathered for
>>that process by using your lazy counting functions, Christoph.
> 
> 
> I don't get it.  It seems to me that any process that's going to
> experience problems with these statistics counters is going to be
> precisely the one that you want the statistics for!  What was the problem
> with per-cpu counters again?
> 

Not sure if they'd be the ones you want statistics for. If so, then
you're stuck between a rock and a hard place really. However if there
is room for compromise, then this may be a solution.

I think the problem with per-cpu counters is that it wouldn't be a
very good solution for mainline either. It would also penalise all
single threaded tasks for zero gain... quite significantly if you
did a static cacheline aligned array in the mm_struct even with
small CPU counts. Maybe less so resource wise if you used
alloc_percpu, but that would increase complexity. I don't know, maybe
it is an option.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 15:47                 ` Russ Anderson
  2004-11-08 16:08                   ` Martin J. Bligh
@ 2004-11-10  4:52                   ` Nick Piggin
  2004-11-10 17:30                     ` Robin Holt
                                       ` (2 more replies)
  1 sibling, 3 replies; 33+ messages in thread
From: Nick Piggin @ 2004-11-10  4:52 UTC (permalink / raw)
  To: Russ Anderson
  Cc: Matthew Wilcox, Martin J. Bligh, Christoph Lameter,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

Russ Anderson wrote:
> Matthew Wilcox wrote:
> 
>>On Sun, Nov 07, 2004 at 08:11:24AM -0800, Martin J. Bligh wrote:
>>
>>>Ummm 10K cpus? I hope that's a typo for processes, or this discussion is
>>>getting rather silly ....
>>
>>NASA bought a 10k CPU system, but that's a cluster.  I think the largest
>>single system within that cluster is 256 CPUs.
> 
> 
> Each "node" is a single linux kernel with 512 processors..
> There are 20 nodes in the cluster.  20 x 512p = 10,240 processors.
> 

Sorry for wandering off topic here... did I imagine it or did I read
that you'd tried to get 2048 CPUs going in a single system at NASA?

I guess the lack of triumphant press release means it didn't go well,
or that I was imagining things.

Also, are you using 2.6 kernels on these 512 CPU systems? or are your
2.4 kernels still holding together at that many CPUs?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-08 16:04                       ` Christoph Lameter
                                           ` (2 preceding siblings ...)
  2004-11-08 16:30                         ` Erich Focht
@ 2004-11-10 11:57                         ` Magnus Damm
  3 siblings, 0 replies; 33+ messages in thread
From: Magnus Damm @ 2004-11-10 11:57 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Martin J. Bligh, Nick Piggin, Benjamin Herrenschmidt,
	Hugh Dickins, linux-mm, linux-ia64

On Mon, 2004-11-08 at 17:04, Christoph Lameter wrote:
> On Sun, 7 Nov 2004, Martin J. Bligh wrote:
> 
> > Doing ps or top is not unusual at all, and the sysadmins should be able
> > to monitor their system in a reasonable way without crippling it, or even
> > effecting it significantly.
> 
> Hmm.. What would you think about a pointer to a stats structure in mm,
> which would only be allocated if stats are requested by /proc actions? The
> struct would contain a timestamp which would insure that the stats are
> only generated in certain intervals and not over and over again. This
> would also make it possible to force a regeneration of the numbers.

I assume you mean that the mm->rss and mm->rss_anon counters have been
replaced with stat calculation on demand. Maybe it is possible to keep a
needs_update-flag with each vma instead. Then only the vma:s with that
flag set needs to be recalculated. 

A nice feature would be to be able to assign each process/mm a stat
gathering mode - choose between no statistics, statistics updated every
N jiffy and real time statistics. Yeah, dream on.

/ magnus


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-10  4:52                   ` Nick Piggin
@ 2004-11-10 17:30                     ` Robin Holt
  2004-11-10 18:50                     ` Ray Bryant
  2004-11-12 23:45                     ` Ray Bryant
  2 siblings, 0 replies; 33+ messages in thread
From: Robin Holt @ 2004-11-10 17:30 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Russ Anderson, Matthew Wilcox, Martin J. Bligh, Christoph Lameter,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

On Wed, Nov 10, 2004 at 03:52:53PM +1100, Nick Piggin wrote:
> Sorry for wandering off topic here... did I imagine it or did I read
> that you'd tried to get 2048 CPUs going in a single system at NASA?

We could not try it.  The current hardware only supports coherence across
256 nodes with a max of 2 cpus per node.

There is a method to use memory on nodes from the other coherence domains
provided there are no cpus at all in that coherence domain.  In that case,
the hardware uses the coherence domain of the requestor.  What that gives
us is a theoretical case where we could have 1024 nodes with memory, but
only 256 of them with cpus.  I forget what the design limit for memory
per node is, but I do know DIMMs that large are not currently available.

> 
> I guess the lack of triumphant press release means it didn't go well,
> or that I was imagining things.
> 
> Also, are you using 2.6 kernels on these 512 CPU systems? or are your
> 2.4 kernels still holding together at that many CPUs?

Our 2.4 kernel is performing better than the 2.6.  That is because
the 2.4 kernel has a lot more tweaks for our customers than the 2.6.
All of our 2.6 work is being pushed towards the community, so we should
get parity soon.

Robin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-10  4:52                   ` Nick Piggin
  2004-11-10 17:30                     ` Robin Holt
@ 2004-11-10 18:50                     ` Ray Bryant
  2004-11-12 23:45                     ` Ray Bryant
  2 siblings, 0 replies; 33+ messages in thread
From: Ray Bryant @ 2004-11-10 18:50 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Russ Anderson, Matthew Wilcox, Martin J. Bligh, Christoph Lameter,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

Nick Piggin wrote:

> 
> Also, are you using 2.6 kernels on these 512 CPU systems? or are your
> 2.4 kernels still holding together at that many CPUs?
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

We aren't supporting customers with 2.6 kernels yet.  NASA's systems are
all running kernels based on 2.4.x.

-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
                                   ` (4 preceding siblings ...)
  2004-11-08 16:56                 ` Hugh Dickins
@ 2004-11-12  0:38                 ` Christoph Lameter
  2004-11-12 12:50                 ` Robin Holt
  2004-11-12 15:02                 ` Christoph Lameter
  7 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-12  0:38 UTC (permalink / raw)
  To: linux-ia64

On Wed, 10 Nov 2004, Magnus Damm wrote:

> A nice feature would be to be able to assign each process/mm a stat
> gathering mode - choose between no statistics, statistics updated every
> N jiffy and real time statistics. Yeah, dream on.

Hmm... Given the various problems with all these approaches this
may be the right way to go. Working on the dream now.


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
                                   ` (5 preceding siblings ...)
  2004-11-12  0:38                 ` Christoph Lameter
@ 2004-11-12 12:50                 ` Robin Holt
  2004-11-12 15:02                 ` Christoph Lameter
  7 siblings, 0 replies; 33+ messages in thread
From: Robin Holt @ 2004-11-12 12:50 UTC (permalink / raw)
  To: linux-ia64

On Thu, Nov 11, 2004 at 04:38:44PM -0800, Christoph Lameter wrote:
> On Wed, 10 Nov 2004, Magnus Damm wrote:
> 
> > A nice feature would be to be able to assign each process/mm a stat
> > gathering mode - choose between no statistics, statistics updated every
> > N jiffy and real time statistics. Yeah, dream on.
> 
> Hmm... Given the various problems with all these approaches this
> may be the right way to go. Working on the dream now.

Are you saying that there is no way to solve this in a scalable fashion?
Not by separating the cachelines for the locks and counters?  Not by
putting in per-cpu counters?  Nothing else will work?  What is the
solution for the customers that do have statistics turned on?  What
multiplier does the admin give to the job when they need to turn on
statistics?  How do job-server type sites work with this when one
of their reasons to kill jobs is based on RSS?  They would need it
turned on for nearly every process and we are back in the same boat.

I am embarrassed for SGI that this discussion has gone on as long
as it has.  There are hundreds of counters in the system which
have been affected by scaling issues.  Why could those be solved
and this one not?  Can we focus on the correct fix instead of
a kludge which will not be acceptable to our customers?

Sorry for the rant.  It has been a stressful week.

Robin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
                                   ` (6 preceding siblings ...)
  2004-11-12 12:50                 ` Robin Holt
@ 2004-11-12 15:02                 ` Christoph Lameter
  7 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2004-11-12 15:02 UTC (permalink / raw)
  To: linux-ia64

On Fri, 12 Nov 2004, Robin Holt wrote:

> Are you saying that there is no way to solve this in a scalable fashion?

No. I am saying that a variety of approaches may be needed and I actually
got a raw draft of a patch here that allows to employ a variety of
ways of obtaining vm statistics.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-10  4:52                   ` Nick Piggin
  2004-11-10 17:30                     ` Robin Holt
  2004-11-10 18:50                     ` Ray Bryant
@ 2004-11-12 23:45                     ` Ray Bryant
  2004-11-13  0:05                       ` Steve Neuner
  2 siblings, 1 reply; 33+ messages in thread
From: Ray Bryant @ 2004-11-12 23:45 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Russ Anderson, Matthew Wilcox, Martin J. Bligh, Christoph Lameter,
	Benjamin Herrenschmidt, Hugh Dickins, linux-mm, linux-ia64

Nick Piggin wrote:

> 
> Also, are you using 2.6 kernels on these 512 CPU systems? or are your
> 2.4 kernels still holding together at that many CPUs?

Nick,

My response to your email was (unfortunately) overly broad.  While the
NASA 512P systems are running a kernel based on 2.4.x, SLES 9, which is 
2.6.5-based, has been certified on a 64p SGI Altix
system (cf. http://developer.novell.com/yes/78980.htm).

So it is possible to buy a supported 2.6 kernel from SuSE that will run your
(up to) 64P Altix.
-- 
Best Regards,
Ray
-----------------------------------------------
                   Ray Bryant
512-453-9679 (work)         512-507-7807 (cell)
raybry@sgi.com             raybry@austin.rr.com
The box said: "Requires Windows 98 or better",
            so I installed Linux.
-----------------------------------------------

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: removing mm->rss and mm->anon_rss from kernel?
  2004-11-12 23:45                     ` Ray Bryant
@ 2004-11-13  0:05                       ` Steve Neuner
  0 siblings, 0 replies; 33+ messages in thread
From: Steve Neuner @ 2004-11-13  0:05 UTC (permalink / raw)
  To: Ray Bryant
  Cc: Nick Piggin, Russ Anderson, Matthew Wilcox, Martin J. Bligh,
	Christoph Lameter, Benjamin Herrenschmidt, Hugh Dickins, linux-mm,
	linux-ia64

> ... SLES 9, which is
> 2.6.5-based, has been certified on a 64p SGI Altix
> system (cf. http://developer.novell.com/yes/78980.htm).
                                              xxxxx

Link correction:
   http://developer.novell.com/yes/77348.htm

--steve

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2004-11-13  0:05 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <4189EC67.40601@yahoo.com.au>
     [not found] ` <Pine.LNX.4.58.0411040820250.8211@schroedinger.engr.sgi.com>
     [not found]   ` <418AD329.3000609@yahoo.com.au>
     [not found]     ` <Pine.LNX.4.58.0411041733270.11583@schroedinger.engr.sgi.com>
     [not found]       ` <418AE0F0.5050908@yahoo.com.au>
     [not found]         ` <418AE9BB.1000602@yahoo.com.au>
     [not found]           ` <1099622957.29587.101.camel@gaston>
     [not found]             ` <418C55A7.9030100@yahoo.com.au>
2004-11-06  9:28               ` removing mm->rss and mm->anon_rss from kernel? Christoph Lameter
     [not found]                 ` <204290000.1099754257@[10.10.2.4]>
2004-11-06 16:19                   ` Christoph Lameter
2004-11-06 20:05                     ` William Lee Irwin III
2004-11-07 16:11                     ` Martin J. Bligh
2004-11-07 18:25                       ` Matthew Wilcox
2004-11-08 12:43                         ` Jesse Barnes
2004-11-08 15:26                         ` Andi Kleen
2004-11-08 16:07                           ` Christoph Lameter
2004-11-08 16:04                       ` Christoph Lameter
2004-11-08 16:12                         ` Anton Blanchard
2004-11-08 16:14                         ` Martin J. Bligh
2004-11-08 16:25                           ` Christoph Lameter
2004-11-08 16:30                         ` Erich Focht
2004-11-08 16:57                           ` Diego Calleja
2004-11-08 17:26                             ` Erich Focht
2004-11-10 11:57                         ` Magnus Damm
2004-11-06 20:51                 ` Rik van Riel
2004-11-08 15:47                 ` Russ Anderson
2004-11-08 16:08                   ` Martin J. Bligh
2004-11-10  4:52                   ` Nick Piggin
2004-11-10 17:30                     ` Robin Holt
2004-11-10 18:50                     ` Ray Bryant
2004-11-12 23:45                     ` Ray Bryant
2004-11-13  0:05                       ` Steve Neuner
2004-11-08 16:35                 ` Rik van Riel
2004-11-08 16:56                 ` Hugh Dickins
2004-11-08 17:01                   ` Christoph Lameter
2004-11-09  2:40                     ` Nick Piggin
2004-11-09 12:10                       ` Matthew Wilcox
2004-11-10  0:48                         ` Nick Piggin
2004-11-12  0:38                 ` Christoph Lameter
2004-11-12 12:50                 ` Robin Holt
2004-11-12 15:02                 ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox