+ memory-controller-memory-accounting-v7.patch added to -mm tree

All of lore.kernel.org
 help / color / mirror / Atom feed

* + memory-controller-memory-accounting-v7.patch added to -mm tree
@ 2007-08-27 21:19 akpm
  2007-08-28  6:35 ` Nick Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: akpm @ 2007-08-27 21:19 UTC (permalink / raw)
  To: mm-commits
  Cc: balbir, a.p.zijlstra, dev, ebiederm, herbert, menage, nickpiggin,
	rientjes, svaidy, xemul


The patch titled
     Memory controller: memory accounting
has been added to the -mm tree.  Its filename is
     memory-controller-memory-accounting-v7.patch

*** Remember to use Documentation/SubmitChecklist when testing your code ***

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this

------------------------------------------------------
Subject: Memory controller: memory accounting
From: Balbir Singh <balbir@linux.vnet.ibm.com>

Add the accounting hooks.  The accounting is carried out for RSS and Page
Cache (unmapped) pages.  There is now a common limit and accounting for both. 
The RSS accounting is accounted at page_add_*_rmap() and page_remove_rmap()
time.  Page cache is accounted at add_to_page_cache(),
__delete_from_page_cache().  Swap cache is also accounted for.

Each page's page_container is protected with the last bit of the
page_container pointer, this makes handling of race conditions involving
simultaneous mappings of a page easier.  A reference count is kept in the
page_container to deal with cases where a page might be unmapped from the RSS
of all tasks, but still lives in the page cache.

Credits go to Vaidyanathan Srinivasan for helping with reference counting work
of the page container.  Almost all of the page cache accounting code has help
from Vaidyanathan Srinivasan.

Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/memcontrol.h |   20 ++++
 mm/filemap.c               |   12 ++
 mm/memcontrol.c            |  166 ++++++++++++++++++++++++++++++++++-
 mm/memory.c                |   43 ++++++++-
 mm/migrate.c               |    6 +
 mm/page_alloc.c            |    3 
 mm/rmap.c                  |   17 +++
 mm/swap_state.c            |   12 ++
 mm/swapfile.c              |   41 +++++---
 9 files changed, 293 insertions(+), 27 deletions(-)

diff -puN include/linux/memcontrol.h~memory-controller-memory-accounting-v7 include/linux/memcontrol.h
--- a/include/linux/memcontrol.h~memory-controller-memory-accounting-v7
+++ a/include/linux/memcontrol.h
@@ -30,6 +30,13 @@ extern void mm_free_container(struct mm_
 extern void page_assign_page_container(struct page *page,
 					struct page_container *pc);
 extern struct page_container *page_get_page_container(struct page *page);
+extern int mem_container_charge(struct page *page, struct mm_struct *mm);
+extern void mem_container_uncharge(struct page_container *pc);
+
+static inline void mem_container_uncharge_page(struct page *page)
+{
+	mem_container_uncharge(page_get_page_container(page));
+}
 
 #else /* CONFIG_CONTAINER_MEM_CONT */
 static inline void mm_init_container(struct mm_struct *mm,
@@ -51,6 +58,19 @@ static inline struct page_container *pag
 	return NULL;
 }
 
+static inline int mem_container_charge(struct page *page, struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mem_container_uncharge(struct page_container *pc)
+{
+}
+
+static inline void mem_container_uncharge_page(struct page *page)
+{
+}
+
 #endif /* CONFIG_CONTAINER_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff -puN mm/filemap.c~memory-controller-memory-accounting-v7 mm/filemap.c
--- a/mm/filemap.c~memory-controller-memory-accounting-v7
+++ a/mm/filemap.c
@@ -31,6 +31,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -116,6 +117,7 @@ void __remove_from_page_cache(struct pag
 {
 	struct address_space *mapping = page->mapping;
 
+	mem_container_uncharge_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -441,6 +443,11 @@ int add_to_page_cache(struct page *page,
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
+
+		error = mem_container_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
@@ -450,10 +457,13 @@ int add_to_page_cache(struct page *page,
 			page->index = offset;
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_container_uncharge_page(page);
+
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
diff -puN mm/memcontrol.c~memory-controller-memory-accounting-v7 mm/memcontrol.c
--- a/mm/memcontrol.c~memory-controller-memory-accounting-v7
+++ a/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/container.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct container_subsys mem_container_subsys;
 
@@ -31,7 +34,9 @@ struct container_subsys mem_container_su
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a container at it's low water mark, this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_container {
 	struct container_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_container {
 };
 
 /*
+ * We use the lower bit of the page->page_container pointer as a bit spin
+ * lock. We need to ensure that page->page_container is atleast two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CONTAINER_LOCK_BIT 	0x0
+#define PAGE_CONTAINER_LOCK 		(1 << PAGE_CONTAINER_LOCK_BIT)
+
+/*
  * A page_container page is associated with every page descriptor. The
  * page_container helps us identify information about the container
  */
@@ -56,6 +69,8 @@ struct page_container {
 	struct list_head lru;		/* per container LRU list */
 	struct page *page;
 	struct mem_container *mem_container;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_container(struct mm_struct 
 	css_put(&mm->mem_container->css);
 }
 
+static inline int page_container_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CONTAINER_LOCK_BIT,
+					&page->page_container);
+}
+
 void page_assign_page_container(struct page *page, struct page_container *pc)
 {
-	page->page_container = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_container we might not hold the
+	 * page_container lock. free_hot_cold_page() is an example
+	 * of such a scenario
+	 */
+	if (pc)
+		VM_BUG_ON(!page_container_locked(page));
+	locked = (page->page_container & PAGE_CONTAINER_LOCK);
+	page->page_container = ((unsigned long)pc | locked);
 }
 
 struct page_container *page_get_page_container(struct page *page)
 {
-	return page->page_container;
+	return (struct page_container *)
+		(page->page_container & ~PAGE_CONTAINER_LOCK);
+}
+
+void __always_inline lock_page_container(struct page *page)
+{
+	bit_spin_lock(PAGE_CONTAINER_LOCK_BIT, &page->page_container);
+	VM_BUG_ON(!page_container_locked(page));
+}
+
+void __always_inline unlock_page_container(struct page *page)
+{
+	bit_spin_unlock(PAGE_CONTAINER_LOCK_BIT, &page->page_container);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the container is over its limit
+ */
+int mem_container_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_container *mem;
+	struct page_container *pc, *race_pc;
+
+	/*
+	 * Should page_container's go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the container page already has a page_container associated
+	 * with it
+	 */
+	lock_page_container(page);
+	pc = page_get_page_container(page);
+	/*
+	 * The page_container exists and the page has already been accounted
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_container(page);
+
+	pc = kzalloc(sizeof(struct page_container), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the container the mm_struct belongs to
+	 * the mm_struct's mem_container changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set, if so charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_container);
+	/*
+	 * For every charge from the container, increment reference
+	 * count
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_container, we should free it on exceeding
+	 * the container limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_container(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_container
+	 */
+	race_pc = page_get_page_container(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_container = mem;
+	pc->page = page;
+	page_assign_page_container(page, pc);
+
+done:
+	unlock_page_container(page);
+	return 0;
+free_pc:
+	kfree(pc);
+	return -ENOMEM;
+err:
+	unlock_page_container(page);
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ */
+void mem_container_uncharge(struct page_container *pc)
+{
+	struct mem_container *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_container(page);
+		mem = pc->mem_container;
+		css_put(&mem->css);
+		page_assign_page_container(page, NULL);
+		unlock_page_container(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_container_read(struct container *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_container_create(struct container_su
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 
diff -puN mm/memory.c~memory-controller-memory-accounting-v7 mm/memory.c
--- a/mm/memory.c~memory-controller-memory-accounting-v7
+++ a/mm/memory.c
@@ -50,6 +50,7 @@
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1136,14 +1137,18 @@ static int insert_page(struct mm_struct 
 	pte_t *pte;
 	spinlock_t *ptl;  
 
+	retval = mem_container_charge(page, mm);
+	if (retval)
+		goto out;
+
 	retval = -EINVAL;
 	if (PageAnon(page))
-		goto out;
+		goto out_uncharge;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
-		goto out;
+		goto out_uncharge;
 	retval = -EBUSY;
 	if (!pte_none(*pte))
 		goto out_unlock;
@@ -1155,8 +1160,11 @@ static int insert_page(struct mm_struct 
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
+	return retval;
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
+out_uncharge:
+	mem_container_uncharge_page(page);
 out:
 	return retval;
 }
@@ -1629,6 +1637,9 @@ gotten:
 		goto oom;
 	cow_user_page(new_page, old_page, address, vma);
 
+	if (mem_container_charge(new_page, mm))
+		goto oom_free_new;
+
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
@@ -1660,7 +1671,9 @@ gotten:
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
-	}
+	} else
+		mem_container_uncharge_page(new_page);
+
 	if (new_page)
 		page_cache_release(new_page);
 	if (old_page)
@@ -1681,6 +1694,8 @@ unlock:
 		put_page(dirty_page);
 	}
 	return ret;
+oom_free_new:
+	__free_page(new_page);
 oom:
 	if (old_page)
 		page_cache_release(old_page);
@@ -2085,6 +2100,11 @@ static int do_swap_page(struct mm_struct
 	}
 
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+	if (mem_container_charge(page, mm)) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	mark_page_accessed(page);
 	lock_page(page);
 
@@ -2121,8 +2141,10 @@ static int do_swap_page(struct mm_struct
 	if (write_access) {
 		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+			mem_container_uncharge_page(page);
 			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 
@@ -2133,6 +2155,7 @@ unlock:
 out:
 	return ret;
 out_nomap:
+	mem_container_uncharge_page(page);
 	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
@@ -2161,6 +2184,9 @@ static int do_anonymous_page(struct mm_s
 	if (!page)
 		goto oom;
 
+		if (mem_container_charge(page, mm))
+			goto oom_free_page;
+
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
@@ -2178,8 +2204,11 @@ unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
 release:
+	mem_container_uncharge_page(page);
 	page_cache_release(page);
 	goto unlock;
+oom_free_page:
+	__free_page(page);
 oom:
 	return VM_FAULT_OOM;
 }
@@ -2290,6 +2319,11 @@ static int __do_fault(struct mm_struct *
 
 	}
 
+	if (mem_container_charge(page, mm)) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/*
@@ -2325,6 +2359,7 @@ static int __do_fault(struct mm_struct *
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
 	} else {
+		mem_container_uncharge_page(page);
 		if (anon)
 			page_cache_release(page);
 		else
diff -puN mm/migrate.c~memory-controller-memory-accounting-v7 mm/migrate.c
--- a/mm/migrate.c~memory-controller-memory-accounting-v7
+++ a/mm/migrate.c
@@ -29,6 +29,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
@@ -157,6 +158,11 @@ static void remove_migration_pte(struct 
  		return;
  	}
 
+	if (mem_container_charge(new, mm)) {
+		pte_unmap(ptep);
+		return;
+	}
+
  	ptl = pte_lockptr(mm, pmd);
  	spin_lock(ptl);
 	pte = *ptep;
diff -puN mm/page_alloc.c~memory-controller-memory-accounting-v7 mm/page_alloc.c
--- a/mm/page_alloc.c~memory-controller-memory-accounting-v7
+++ a/mm/page_alloc.c
@@ -42,6 +42,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1019,6 +1020,7 @@ static void fastcall free_hot_cold_page(
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+	page_assign_page_container(page, NULL);
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
@@ -2561,6 +2563,7 @@ void __meminit memmap_init_zone(unsigned
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		page_assign_page_container(page, NULL);
 		SetPageReserved(page);
 
 		/*
diff -puN mm/rmap.c~memory-controller-memory-accounting-v7 mm/rmap.c
--- a/mm/rmap.c~memory-controller-memory-accounting-v7
+++ a/mm/rmap.c
@@ -48,6 +48,7 @@
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 
@@ -550,8 +551,14 @@ void page_add_anon_rmap(struct page *pag
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (atomic_inc_and_test(&page->_mapcount))
 		__page_set_anon_rmap(page, vma, address);
-	else
+	else {
 		__page_check_anon_rmap(page, vma, address);
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_container_uncharge_page(page);
+	}
 }
 
 /*
@@ -582,6 +589,12 @@ void page_add_file_rmap(struct page *pag
 {
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
+	else
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_container_uncharge_page(page);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -642,6 +655,8 @@ void page_remove_rmap(struct page *page,
 			page_clear_dirty(page);
 			set_page_dirty(page);
 		}
+		mem_container_uncharge_page(page);
+
 		__dec_zone_page_state(page,
 				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 	}
diff -puN mm/swap_state.c~memory-controller-memory-accounting-v7 mm/swap_state.c
--- a/mm/swap_state.c~memory-controller-memory-accounting-v7
+++ a/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
@@ -80,6 +81,11 @@ static int __add_to_swap_cache(struct pa
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
+
+		error = mem_container_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
@@ -89,10 +95,13 @@ static int __add_to_swap_cache(struct pa
 			set_page_private(page, entry.val);
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_container_uncharge_page(page);
+
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 
@@ -132,6 +141,7 @@ void __delete_from_swap_cache(struct pag
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
+	mem_container_uncharge_page(page);
 	radix_tree_delete(&swapper_space.page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
diff -puN mm/swapfile.c~memory-controller-memory-accounting-v7 mm/swapfile.c
--- a/mm/swapfile.c~memory-controller-memory-accounting-v7
+++ a/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,12 @@ unsigned int count_swap_pages(int type, 
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	if (mem_container_charge(page, vma->vm_mm))
+		return -ENOMEM;
+
 	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +524,7 @@ static void unuse_pte(struct vm_area_str
 	 * immediately swapped out again after swapon.
 	 */
 	activate_page(page);
+	return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -529,7 +534,7 @@ static int unuse_pte_range(struct vm_are
 	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
 	spinlock_t *ptl;
-	int found = 0;
+	int ret = 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
@@ -538,13 +543,12 @@ static int unuse_pte_range(struct vm_are
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
+			ret = unuse_pte(vma, pte++, addr, entry, page);
 			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
-	return found;
+	return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -553,14 +557,16 @@ static inline int unuse_pmd_range(struct
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int ret;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -571,14 +577,16 @@ static inline int unuse_pud_range(struct
 {
 	pud_t *pud;
 	unsigned long next;
+	int ret;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-			return 1;
+		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -588,6 +596,7 @@ static int unuse_vma(struct vm_area_stru
 {
 	pgd_t *pgd;
 	unsigned long addr, end, next;
+	int ret;
 
 	if (page->mapping) {
 		addr = page_address_in_vma(page, vma);
@@ -605,8 +614,9 @@ static int unuse_vma(struct vm_area_stru
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -615,6 +625,7 @@ static int unuse_mm(struct mm_struct *mm
 				swp_entry_t entry, struct page *page)
 {
 	struct vm_area_struct *vma;
+	int ret = 0;
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
@@ -627,15 +638,11 @@ static int unuse_mm(struct mm_struct *mm
 		lock_page(page);
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (vma->anon_vma && unuse_vma(vma, entry, page))
+		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
 			break;
 	}
 	up_read(&mm->mmap_sem);
-	/*
-	 * Currently unuse_mm cannot fail, but leave error handling
-	 * at call sites for now, since we change it from time to time.
-	 */
-	return 0;
+	return ret;
 }
 
 /*
_

Patches currently in -mm which might be from balbir@linux.vnet.ibm.com are

origin.patch
memoryless-nodes-fixup-uses-of-node_online_map-in-generic-code-fix.patch
clean-up-duplicate-includes-in-documentation.patch
add-containerstats-v3.patch
add-containerstats-v3-fix.patch
memory-controller-add-documentation.patch
memory-controller-resource-counters-v7.patch
memory-controller-containers-setup-v7.patch
memory-controller-accounting-setup-v7.patch
memory-controller-memory-accounting-v7.patch
memory-controller-task-migration-v7.patch
memory-controller-add-per-container-lru-and-reclaim-v7.patch
memory-controller-oom-handling-v7.patch
memory-controller-add-switch-to-control-what-type-of-pages-to-limit-v7.patch
memory-controller-make-page_referenced-container-aware-v7.patch

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-27 21:19 + memory-controller-memory-accounting-v7.patch added to -mm tree akpm
@ 2007-08-28  6:35 ` Nick Piggin
  2007-08-28  7:26   ` Balbir Singh
  0 siblings, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2007-08-28  6:35 UTC (permalink / raw)
  To: akpm
  Cc: balbir, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

akpm@linux-foundation.org wrote:
> The patch titled
>      Memory controller: memory accounting

What's the plans to deal with other types of pages other than pagecache?
Surely not adding hooks on a case-by-case basis throughout all the kernel?
Or do we not actually care about giving real guarantees about memory
availability? Presumably you do, otherwise you wouldn't be carefully
charging things before you actually insert them and uncharing afterwards
on failure, etc.

If you do, then I would have thought the best way to go would be to start
with the simplest approach, and add things like targetted reclaim after
that.

I just don't want to see little bits to add more and more hooks slowly go
in under the radar ;) So... there is a coherent plan, right?

> +/*
> + * Charge the memory controller for page usage.
> + * Return
> + * 0 if the charge was successful
> + * < 0 if the container is over its limit
> + */
> +int mem_container_charge(struct page *page, struct mm_struct *mm)
> +{
> +	struct mem_container *mem;
> +	struct page_container *pc, *race_pc;
> +
> +	/*
> +	 * Should page_container's go to their own slab?
> +	 * One could optimize the performance of the charging routine
> +	 * by saving a bit in the page_flags and using it as a lock
> +	 * to see if the container page already has a page_container associated
> +	 * with it
> +	 */
> +	lock_page_container(page);
> +	pc = page_get_page_container(page);
> +	/*
> +	 * The page_container exists and the page has already been accounted
> +	 */
> +	if (pc) {
> +		atomic_inc(&pc->ref_cnt);
> +		goto done;
> +	}
> +
> +	unlock_page_container(page);
> +
> +	pc = kzalloc(sizeof(struct page_container), GFP_KERNEL);
> +	if (pc == NULL)
> +		goto err;
> +
> +	rcu_read_lock();
> +	/*
> +	 * We always charge the container the mm_struct belongs to
> +	 * the mm_struct's mem_container changes on task migration if the
> +	 * thread group leader migrates. It's possible that mm is not
> +	 * set, if so charge the init_mm (happens for pagecache usage).
> +	 */
> +	if (!mm)
> +		mm = &init_mm;
> +
> +	mem = rcu_dereference(mm->mem_container);
> +	/*
> +	 * For every charge from the container, increment reference
> +	 * count
> +	 */
> +	css_get(&mem->css);
> +	rcu_read_unlock();

Where's the corresponding rcu_assign_pointer? Oh, in the next patch. That's
unconventional (can you rearrange it so they go in together, please?).

> @@ -1629,6 +1637,9 @@ gotten:
>  		goto oom;
>  	cow_user_page(new_page, old_page, address, vma);
>  
> +	if (mem_container_charge(new_page, mm))
> +		goto oom_free_new;
> +
>  	/*
>  	 * Re-check the pte - we dropped the lock
>  	 */
> @@ -1660,7 +1671,9 @@ gotten:
>  		/* Free the old page.. */
>  		new_page = old_page;
>  		ret |= VM_FAULT_WRITE;
> -	}
> +	} else
> +		mem_container_uncharge_page(new_page);
> +
>  	if (new_page)
>  		page_cache_release(new_page);
>  	if (old_page)
> @@ -1681,6 +1694,8 @@ unlock:
>  		put_page(dirty_page);
>  	}
>  	return ret;
> +oom_free_new:
> +	__free_page(new_page);
>  oom:
>  	if (old_page)
>  		page_cache_release(old_page);
> @@ -2085,6 +2100,11 @@ static int do_swap_page(struct mm_struct
>  	}
>  
>  	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
> +	if (mem_container_charge(page, mm)) {
> +		ret = VM_FAULT_OOM;
> +		goto out;
> +	}

The oom-from-pagefault path is quite crap (not this code, mainline).
It doesn't obey any of the given oom killing policy.

I had a patch to fix this, but nobody else was too concerned at that
stage. I don't expect that path would have been hit very much before,
but with a lot of -EFAULTs coming out of containers, I think
VM_FAULT_OOMs could easily trigger in practice now.

http://lkml.org/lkml/2006/10/12/158

Patch may be a little outdated, but the basics should still work. You
might be in a good position to test it with these container patches?

> diff -puN mm/rmap.c~memory-controller-memory-accounting-v7 mm/rmap.c
> --- a/mm/rmap.c~memory-controller-memory-accounting-v7
> +++ a/mm/rmap.c
> @@ -48,6 +48,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/module.h>
>  #include <linux/kallsyms.h>
> +#include <linux/memcontrol.h>
>  
>  #include <asm/tlbflush.h>
>  
> @@ -550,8 +551,14 @@ void page_add_anon_rmap(struct page *pag
>  	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
>  	if (atomic_inc_and_test(&page->_mapcount))
>  		__page_set_anon_rmap(page, vma, address);
> -	else
> +	else {
>  		__page_check_anon_rmap(page, vma, address);
> +		/*
> +		 * We unconditionally charged during prepare, we uncharge here
> +		 * This takes care of balancing the reference counts
> +		 */
> +		mem_container_uncharge_page(page);
> +	}
>  }
>  
>  /*
> @@ -582,6 +589,12 @@ void page_add_file_rmap(struct page *pag
>  {
>  	if (atomic_inc_and_test(&page->_mapcount))
>  		__inc_zone_page_state(page, NR_FILE_MAPPED);
> +	else
> +		/*
> +		 * We unconditionally charged during prepare, we uncharge here
> +		 * This takes care of balancing the reference counts
> +		 */
> +		mem_container_uncharge_page(page);
>  }

What's "during prepare"? Better would be "before adding the page to
page tables" or something.

But... why do you do it? The refcounts and charging are alrady taken
care of in your charge function, aren't they? Just unconditionally
charge at map time and unconditionally uncharge at unmap time, and
let your accounting implementation take care of what to actually do.

(This is what I mean about mem container creeping into core code --
it's fine to have some tasteful hooks, but introducing these more
complex interactions between core VM and container accounting details
is nasty).

I would much prefer this whole thing to _not_ to hook into rmap like
this at all. Do the call unconditionally, and your container
implementation can do as much weird and wonderful refcounting as its
heart desires.

>  #ifdef CONFIG_DEBUG_VM
> @@ -642,6 +655,8 @@ void page_remove_rmap(struct page *page,
>  			page_clear_dirty(page);
>  			set_page_dirty(page);
>  		}
> +		mem_container_uncharge_page(page);
> +
>  		__dec_zone_page_state(page,
>  				PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
>  	}
> diff -puN mm/swap_state.c~memory-controller-memory-accounting-v7 mm/swap_state.c
> --- a/mm/swap_state.c~memory-controller-memory-accounting-v7
> +++ a/mm/swap_state.c
> @@ -17,6 +17,7 @@
>  #include <linux/backing-dev.h>
>  #include <linux/pagevec.h>
>  #include <linux/migrate.h>
> +#include <linux/memcontrol.h>
>  
>  #include <asm/pgtable.h>
>  
> @@ -80,6 +81,11 @@ static int __add_to_swap_cache(struct pa
>  	BUG_ON(PagePrivate(page));
>  	error = radix_tree_preload(gfp_mask);
>  	if (!error) {
> +
> +		error = mem_container_charge(page, current->mm);
> +		if (error)
> +			goto out;
> +
>  		write_lock_irq(&swapper_space.tree_lock);
>  		error = radix_tree_insert(&swapper_space.page_tree,
>  						entry.val, page);
> @@ -89,10 +95,13 @@ static int __add_to_swap_cache(struct pa
>  			set_page_private(page, entry.val);
>  			total_swapcache_pages++;
>  			__inc_zone_page_state(page, NR_FILE_PAGES);
> -		}
> +		} else
> +			mem_container_uncharge_page(page);
> +
>  		write_unlock_irq(&swapper_space.tree_lock);
>  		radix_tree_preload_end();
>  	}
> +out:
>  	return error;
>  }

Uh. You have to be really careful here (and in add_to_page_cache, and
possibly others I haven't really looked)... you're ignoring gfp masks
because mem_container_charge allocates with GFP_KERNEL. You can have
all sorts of GFP_ATOMIC, GFP_NOIO, NOFS etc. come in these places that
will deadlock.

I think I would much prefer you make mem_container_charge run atomically,
and then have just a single hook at the site of where everything else is
checked, rather than having all this pre-charge post-uncharge messiness.
Ditto for mm/memory.c... all call sites really... should make them much
nicer looking.

...

I didn't look at all code in all patches. But overall I guess it isn't
looking too bad (though I'd really like people like Andrea and Hugh to
take a look as well, eventually). It is still a lot uglier in terms of
core mm hooks than I would like, but with luck that could be improved
(some of my suggestions might even help a bit).

Haven't looked at the targetted reclaim patches yet (sigh, more
container tentacles :P).

-- 
SUSE Labs, Novell Inc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-28  6:35 ` Nick Piggin
@ 2007-08-28  7:26   ` Balbir Singh
  2007-08-28  9:29     ` Nick Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Balbir Singh @ 2007-08-28  7:26 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Nick Piggin wrote:
> akpm@linux-foundation.org wrote:
>> The patch titled
>>      Memory controller: memory accounting
> 
> What's the plans to deal with other types of pages other than pagecache?
> Surely not adding hooks on a case-by-case basis throughout all the kernel?
> Or do we not actually care about giving real guarantees about memory
> availability? Presumably you do, otherwise you wouldn't be carefully
> charging things before you actually insert them and uncharing afterwards
> on failure, etc.
> 

We deal with RSS and Page/Swap Cache in this controller.

> If you do, then I would have thought the best way to go would be to start
> with the simplest approach, and add things like targetted reclaim after
> that.
> 

I think the kernel accounting/control system will have to be an independent
controller (since we cannot reclaim kernel pages, just limit them by
accounting (except for some slab pages which are free)).

> I just don't want to see little bits to add more and more hooks slowly go
> in under the radar ;) So... there is a coherent plan, right?
> 

Nothing will go under the radar :-) Everything will be posted to linux-mm
and linux-kernel. We'll make sure you don't wake up one day and see
the mm code has changed to something you cannot recognize ;)

> 
>> +/*
>> + * Charge the memory controller for page usage.
>> + * Return
>> + * 0 if the charge was successful
>> + * < 0 if the container is over its limit
>> + */
>> +int mem_container_charge(struct page *page, struct mm_struct *mm)
>> +{
>> +    struct mem_container *mem;
>> +    struct page_container *pc, *race_pc;
>> +
>> +    /*
>> +     * Should page_container's go to their own slab?
>> +     * One could optimize the performance of the charging routine
>> +     * by saving a bit in the page_flags and using it as a lock
>> +     * to see if the container page already has a page_container
>> associated
>> +     * with it
>> +     */
>> +    lock_page_container(page);
>> +    pc = page_get_page_container(page);
>> +    /*
>> +     * The page_container exists and the page has already been accounted
>> +     */
>> +    if (pc) {
>> +        atomic_inc(&pc->ref_cnt);
>> +        goto done;
>> +    }
>> +
>> +    unlock_page_container(page);
>> +
>> +    pc = kzalloc(sizeof(struct page_container), GFP_KERNEL);
>> +    if (pc == NULL)
>> +        goto err;
>> +
>> +    rcu_read_lock();
>> +    /*
>> +     * We always charge the container the mm_struct belongs to
>> +     * the mm_struct's mem_container changes on task migration if the
>> +     * thread group leader migrates. It's possible that mm is not
>> +     * set, if so charge the init_mm (happens for pagecache usage).
>> +     */
>> +    if (!mm)
>> +        mm = &init_mm;
>> +
>> +    mem = rcu_dereference(mm->mem_container);
>> +    /*
>> +     * For every charge from the container, increment reference
>> +     * count
>> +     */
>> +    css_get(&mem->css);
>> +    rcu_read_unlock();
> 
> Where's the corresponding rcu_assign_pointer? Oh, in the next patch. That's
> unconventional (can you rearrange it so they go in together, please?).
> 
> 

The movement is associated with task migration, it should be easy to move
the patch around.

>> @@ -1629,6 +1637,9 @@ gotten:
>>          goto oom;
>>      cow_user_page(new_page, old_page, address, vma);
>>  
>> +    if (mem_container_charge(new_page, mm))
>> +        goto oom_free_new;
>> +
>>      /*
>>       * Re-check the pte - we dropped the lock
>>       */
>> @@ -1660,7 +1671,9 @@ gotten:
>>          /* Free the old page.. */
>>          new_page = old_page;
>>          ret |= VM_FAULT_WRITE;
>> -    }
>> +    } else
>> +        mem_container_uncharge_page(new_page);
>> +
>>      if (new_page)
>>          page_cache_release(new_page);
>>      if (old_page)
>> @@ -1681,6 +1694,8 @@ unlock:
>>          put_page(dirty_page);
>>      }
>>      return ret;
>> +oom_free_new:
>> +    __free_page(new_page);
>>  oom:
>>      if (old_page)
>>          page_cache_release(old_page);
>> @@ -2085,6 +2100,11 @@ static int do_swap_page(struct mm_struct
>>      }
>>  
>>      delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
>> +    if (mem_container_charge(page, mm)) {
>> +        ret = VM_FAULT_OOM;
>> +        goto out;
>> +    }
> 
> The oom-from-pagefault path is quite crap (not this code, mainline).
> It doesn't obey any of the given oom killing policy.
> 
> I had a patch to fix this, but nobody else was too concerned at that
> stage. I don't expect that path would have been hit very much before,
> but with a lot of -EFAULTs coming out of containers, I think
> VM_FAULT_OOMs could easily trigger in practice now.
> 
> http://lkml.org/lkml/2006/10/12/158
> 
> Patch may be a little outdated, but the basics should still work. You
> might be in a good position to test it with these container patches?
> 


I'll test the patch. Thanks for pointing me in the right direction.

> 
>> diff -puN mm/rmap.c~memory-controller-memory-accounting-v7 mm/rmap.c
>> --- a/mm/rmap.c~memory-controller-memory-accounting-v7
>> +++ a/mm/rmap.c
>> @@ -48,6 +48,7 @@
>>  #include <linux/rcupdate.h>
>>  #include <linux/module.h>
>>  #include <linux/kallsyms.h>
>> +#include <linux/memcontrol.h>
>>  
>>  #include <asm/tlbflush.h>
>>  
>> @@ -550,8 +551,14 @@ void page_add_anon_rmap(struct page *pag
>>      VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
>>      if (atomic_inc_and_test(&page->_mapcount))
>>          __page_set_anon_rmap(page, vma, address);
>> -    else
>> +    else {
>>          __page_check_anon_rmap(page, vma, address);
>> +        /*
>> +         * We unconditionally charged during prepare, we uncharge here
>> +         * This takes care of balancing the reference counts
>> +         */
>> +        mem_container_uncharge_page(page);
>> +    }
>>  }
>>  
>>  /*
>> @@ -582,6 +589,12 @@ void page_add_file_rmap(struct page *pag
>>  {
>>      if (atomic_inc_and_test(&page->_mapcount))
>>          __inc_zone_page_state(page, NR_FILE_MAPPED);
>> +    else
>> +        /*
>> +         * We unconditionally charged during prepare, we uncharge here
>> +         * This takes care of balancing the reference counts
>> +         */
>> +        mem_container_uncharge_page(page);
>>  }
> 
> What's "during prepare"? Better would be "before adding the page to
> page tables" or something.
> 

Yeah.. I'll change that comment

> But... why do you do it? The refcounts and charging are alrady taken
> care of in your charge function, aren't they? Just unconditionally
> charge at map time and unconditionally uncharge at unmap time, and
> let your accounting implementation take care of what to actually do.
> 
> (This is what I mean about mem container creeping into core code --
> it's fine to have some tasteful hooks, but introducing these more
> complex interactions between core VM and container accounting details
> is nasty).
> 
> I would much prefer this whole thing to _not_ to hook into rmap like
> this at all. Do the call unconditionally, and your container
> implementation can do as much weird and wonderful refcounting as its
> heart desires.
> 

The reason why we have the accounting this way is because

After we charge, some other code path could fail and we need
to uncharge the page. We do most of the refcounting internally.
mem_container_charge() could fail if the container is over
its limit and we cannot reclaim enough pages to allow a new
charge to be added. In that case we go to OOM.


> 
>>  #ifdef CONFIG_DEBUG_VM
>> @@ -642,6 +655,8 @@ void page_remove_rmap(struct page *page,
>>              page_clear_dirty(page);
>>              set_page_dirty(page);
>>          }
>> +        mem_container_uncharge_page(page);
>> +
>>          __dec_zone_page_state(page,
>>                  PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
>>      }
>> diff -puN mm/swap_state.c~memory-controller-memory-accounting-v7
>> mm/swap_state.c
>> --- a/mm/swap_state.c~memory-controller-memory-accounting-v7
>> +++ a/mm/swap_state.c
>> @@ -17,6 +17,7 @@
>>  #include <linux/backing-dev.h>
>>  #include <linux/pagevec.h>
>>  #include <linux/migrate.h>
>> +#include <linux/memcontrol.h>
>>  
>>  #include <asm/pgtable.h>
>>  
>> @@ -80,6 +81,11 @@ static int __add_to_swap_cache(struct pa
>>      BUG_ON(PagePrivate(page));
>>      error = radix_tree_preload(gfp_mask);
>>      if (!error) {
>> +
>> +        error = mem_container_charge(page, current->mm);
>> +        if (error)
>> +            goto out;
>> +
>>          write_lock_irq(&swapper_space.tree_lock);
>>          error = radix_tree_insert(&swapper_space.page_tree,
>>                          entry.val, page);
>> @@ -89,10 +95,13 @@ static int __add_to_swap_cache(struct pa
>>              set_page_private(page, entry.val);
>>              total_swapcache_pages++;
>>              __inc_zone_page_state(page, NR_FILE_PAGES);
>> -        }
>> +        } else
>> +            mem_container_uncharge_page(page);
>> +
>>          write_unlock_irq(&swapper_space.tree_lock);
>>          radix_tree_preload_end();
>>      }
>> +out:
>>      return error;
>>  }
> 
> Uh. You have to be really careful here (and in add_to_page_cache, and
> possibly others I haven't really looked)... you're ignoring gfp masks
> because mem_container_charge allocates with GFP_KERNEL. You can have
> all sorts of GFP_ATOMIC, GFP_NOIO, NOFS etc. come in these places that
> will deadlock.
> 

I ran into some trouble with move_to_swap_cache() and move_from_swap_cache()
since they use GFP_ATOMIC. Given the page is already accounted for and
refcounted, the refcounting helped avoid blocking.

> I think I would much prefer you make mem_container_charge run atomically,
> and then have just a single hook at the site of where everything else is
> checked, rather than having all this pre-charge post-uncharge messiness.
> Ditto for mm/memory.c... all call sites really... should make them much
> nicer looking.
> 

The problem is that we need to charge, before we add the page to the page
table and that needs to be a blocking context (since we reclaim when
the container goes over limit). The uncharge part comes in when we
handle errors from other paths (like I've mentioned earlier in this
email).

> 
> I didn't look at all code in all patches. But overall I guess it isn't
> looking too bad (though I'd really like people like Andrea and Hugh to
> take a look as well, eventually). It is still a lot uglier in terms of
> core mm hooks than I would like, but with luck that could be improved
> (some of my suggestions might even help a bit).
> 

I would like Andrea and Hugh to review it as well. Eventually, we want
the best thing to go in and make it as non-intrusive as possible.


> Haven't looked at the targetted reclaim patches yet (sigh, more
> container tentacles :P).
> 

:-)

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-28  7:26   ` Balbir Singh
@ 2007-08-28  9:29     ` Nick Piggin
  2007-08-28 11:39       ` Balbir Singh
  0 siblings, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2007-08-28  9:29 UTC (permalink / raw)
  To: balbir
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Balbir Singh wrote:
> Nick Piggin wrote:
> 
>>akpm@linux-foundation.org wrote:
>>
>>>The patch titled
>>>     Memory controller: memory accounting
>>
>>What's the plans to deal with other types of pages other than pagecache?
>>Surely not adding hooks on a case-by-case basis throughout all the kernel?
>>Or do we not actually care about giving real guarantees about memory
>>availability? Presumably you do, otherwise you wouldn't be carefully
>>charging things before you actually insert them and uncharing afterwards
>>on failure, etc.
>>
> 
> 
> We deal with RSS and Page/Swap Cache in this controller.

Sure. And if all you intend is workload management, then that's probably
fine. If you want guarantees, then its useless on its own.


>>If you do, then I would have thought the best way to go would be to start
>>with the simplest approach, and add things like targetted reclaim after
>>that.
>>
> 
> 
> I think the kernel accounting/control system will have to be an independent
> controller (since we cannot reclaim kernel pages, just limit them by
> accounting (except for some slab pages which are free)).

I don't see why accounting would have to be different.

Control obviously will, but for the purposes of using a page, it doesn't
matter to anybody else in the system whether some container has used it
for a pagecache page, or for something else like page tables. The end
result obviously is only that you can either kill the container or
reclaim its reclaimablememory, but before you get to that point, you
need to do the actual  accounting, and AFAIKS it would be identical for
all pages (at least for this basic first-touch charging scheme).


>>I just don't want to see little bits to add more and more hooks slowly go
>>in under the radar ;) So... there is a coherent plan, right?
>>
> 
> 
> Nothing will go under the radar :-) Everything will be posted to linux-mm
> and linux-kernel. We'll make sure you don't wake up one day and see
> the mm code has changed to something you cannot recognize ;)

I don't mean to say it would be done on purpose, but without a coherent
strategy then things might slowly just get added as people think they are
needed. I'm fairly sure several people want to really guarantee memory
resources in an untrusted environment, don't they? And that's obviously
not going to scale by putting calls all throughout the kernel.


>>>+    mem = rcu_dereference(mm->mem_container);
>>>+    /*
>>>+     * For every charge from the container, increment reference
>>>+     * count
>>>+     */
>>>+    css_get(&mem->css);
>>>+    rcu_read_unlock();
>>
>>Where's the corresponding rcu_assign_pointer? Oh, in the next patch. That's
>>unconventional (can you rearrange it so they go in together, please?).
>>
>>
> 
> 
> The movement is associated with task migration, it should be easy to move
> the patch around.

Thanks.


>>>+    if (mem_container_charge(page, mm)) {
>>>+        ret = VM_FAULT_OOM;
>>>+        goto out;
>>>+    }
>>
>>The oom-from-pagefault path is quite crap (not this code, mainline).
>>It doesn't obey any of the given oom killing policy.
>>
>>I had a patch to fix this, but nobody else was too concerned at that
>>stage. I don't expect that path would have been hit very much before,
>>but with a lot of -EFAULTs coming out of containers, I think
>>VM_FAULT_OOMs could easily trigger in practice now.
>>
>>http://lkml.org/lkml/2006/10/12/158
>>
>>Patch may be a little outdated, but the basics should still work. You
>>might be in a good position to test it with these container patches?
>>
> 
> 
> 
> I'll test the patch. Thanks for pointing me in the right direction.

OK, thanks. Let me know how it goes and we could try again to get it
merged.


>>>@@ -582,6 +589,12 @@ void page_add_file_rmap(struct page *pag
>>> {
>>>     if (atomic_inc_and_test(&page->_mapcount))
>>>         __inc_zone_page_state(page, NR_FILE_MAPPED);
>>>+    else
>>>+        /*
>>>+         * We unconditionally charged during prepare, we uncharge here
>>>+         * This takes care of balancing the reference counts
>>>+         */
>>>+        mem_container_uncharge_page(page);
>>> }
>>
>>What's "during prepare"? Better would be "before adding the page to
>>page tables" or something.
>>
> 
> 
> Yeah.. I'll change that comment
> 
> 
>>But... why do you do it? The refcounts and charging are alrady taken
>>care of in your charge function, aren't they? Just unconditionally
>>charge at map time and unconditionally uncharge at unmap time, and
>>let your accounting implementation take care of what to actually do.
>>
>>(This is what I mean about mem container creeping into core code --
>>it's fine to have some tasteful hooks, but introducing these more
>>complex interactions between core VM and container accounting details
>>is nasty).
>>
>>I would much prefer this whole thing to _not_ to hook into rmap like
>>this at all. Do the call unconditionally, and your container
>>implementation can do as much weird and wonderful refcounting as its
>>heart desires.
>>
> 
> 
> The reason why we have the accounting this way is because
> 
> After we charge, some other code path could fail and we need
> to uncharge the page. We do most of the refcounting internally.
> mem_container_charge() could fail if the container is over
> its limit and we cannot reclaim enough pages to allow a new
> charge to be added. In that case we go to OOM.

But at this point you have already charged the container, and have put
it in the page tables, if I read correctly. Nothing is going to fail
at this point and the page could get uncharged when it is unmapped?


>>>@@ -80,6 +81,11 @@ static int __add_to_swap_cache(struct pa
>>>     BUG_ON(PagePrivate(page));
>>>     error = radix_tree_preload(gfp_mask);
>>>     if (!error) {
>>>+
>>>+        error = mem_container_charge(page, current->mm);
>>>+        if (error)
>>>+            goto out;
>>>+
>>>         write_lock_irq(&swapper_space.tree_lock);
>>>         error = radix_tree_insert(&swapper_space.page_tree,
>>>                         entry.val, page);
>>>@@ -89,10 +95,13 @@ static int __add_to_swap_cache(struct pa
>>>             set_page_private(page, entry.val);
>>>             total_swapcache_pages++;
>>>             __inc_zone_page_state(page, NR_FILE_PAGES);
>>>-        }
>>>+        } else
>>>+            mem_container_uncharge_page(page);
>>>+
>>>         write_unlock_irq(&swapper_space.tree_lock);
>>>         radix_tree_preload_end();
>>>     }
>>>+out:
>>>     return error;
>>> }
>>
>>Uh. You have to be really careful here (and in add_to_page_cache, and
>>possibly others I haven't really looked)... you're ignoring gfp masks
>>because mem_container_charge allocates with GFP_KERNEL. You can have
>>all sorts of GFP_ATOMIC, GFP_NOIO, NOFS etc. come in these places that
>>will deadlock.
>>
> 
> 
> I ran into some trouble with move_to_swap_cache() and move_from_swap_cache()
> since they use GFP_ATOMIC. Given the page is already accounted for and
> refcounted, the refcounting helped avoid blocking.

add_to_page_cache gets called with GFP_ATOMIC as well, and it gets called
with GFP_NOFS for new pages.

But I don't think you were suggesting that this isn't a problem, where
you? Relying on implementation in the VM would signal more broken layering.


>>I think I would much prefer you make mem_container_charge run atomically,
>>and then have just a single hook at the site of where everything else is
>>checked, rather than having all this pre-charge post-uncharge messiness.
>>Ditto for mm/memory.c... all call sites really... should make them much
>>nicer looking.
>>
> 
> 
> The problem is that we need to charge, before we add the page to the page
> table and that needs to be a blocking context (since we reclaim when
> the container goes over limit). The uncharge part comes in when we
> handle errors from other paths (like I've mentioned earlier in this
> email).

So I don't understand how you'd deal with GFP_ATOMIC and such restricted
masks, then, if you want to block here.

It would be so so much easier and cleaner for the VM if you did all the
accounting in page alloc and freeing hooks, and just put the page
on per-container LRUs when it goes on the regular LRU.

I'm still going to keep pushing for that approach until either someone
explains why it can't be done or the current patch gets a fair bit
cleaner. Has that already been tried and shown not to work? I would have
thought so seeing as it would be the simplest patch, however I can't
remember hearing about the actual problems with it.

But anyway I don't have any problems with it getting into -mm at this
stage for more exposure and review. Maybe my concerns will get
overridden anyway ;)

-- 
SUSE Labs, Novell Inc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-28  9:29     ` Nick Piggin
@ 2007-08-28 11:39       ` Balbir Singh
  2007-08-29  7:28         ` Nick Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Balbir Singh @ 2007-08-28 11:39 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Nick Piggin wrote:
> Balbir Singh wrote:
>> Nick Piggin wrote:
>>
>>> akpm@linux-foundation.org wrote:
>>>
>>>> The patch titled
>>>>     Memory controller: memory accounting
>>>
>>> What's the plans to deal with other types of pages other than pagecache?
>>> Surely not adding hooks on a case-by-case basis throughout all the
>>> kernel?
>>> Or do we not actually care about giving real guarantees about memory
>>> availability? Presumably you do, otherwise you wouldn't be carefully
>>> charging things before you actually insert them and uncharing afterwards
>>> on failure, etc.
>>>
>>
>>
>> We deal with RSS and Page/Swap Cache in this controller.
> 
> Sure. And if all you intend is workload management, then that's probably
> fine. If you want guarantees, then its useless on its own.
> 

Yes. Guarantees are hard to do with just this controller (due to non-reclaimable
pages). It should be easy to add a mlock() controller on top of this one.
Pavel is planning to work on a kernel memory controller. With that in place,
guarantees might be possible.

> 
>>> If you do, then I would have thought the best way to go would be to
>>> start
>>> with the simplest approach, and add things like targetted reclaim after
>>> that.
>>>
>>
>>
>> I think the kernel accounting/control system will have to be an
>> independent
>> controller (since we cannot reclaim kernel pages, just limit them by
>> accounting (except for some slab pages which are free)).
> 
> I don't see why accounting would have to be different.
> 
> Control obviously will, but for the purposes of using a page, it doesn't
> matter to anybody else in the system whether some container has used it
> for a pagecache page, or for something else like page tables. The end
> result obviously is only that you can either kill the container or
> reclaim its reclaimablememory, but before you get to that point, you
> need to do the actual  accounting, and AFAIKS it would be identical for
> all pages (at least for this basic first-touch charging scheme).
> 


Yes.. I see your point

> 
>>> I just don't want to see little bits to add more and more hooks
>>> slowly go
>>> in under the radar ;) So... there is a coherent plan, right?
>>>
>>
>>
>> Nothing will go under the radar :-) Everything will be posted to linux-mm
>> and linux-kernel. We'll make sure you don't wake up one day and see
>> the mm code has changed to something you cannot recognize ;)
> 
> I don't mean to say it would be done on purpose, but without a coherent
> strategy then things might slowly just get added as people think they are
> needed. I'm fairly sure several people want to really guarantee memory
> resources in an untrusted environment, don't they? And that's obviously
> not going to scale by putting calls all throughout the kernel.
> 

We sent out an RFC last year and got several comments from stake holders

1. Pavel Emelianov
2. Paul Menage
3. Vaidyanathan Srinivasan
4. YAMAMOTO Takshi
5. KAMEZAWA Hiroyuki


At the OLS resource management BoF, we had a broader participation and
several comments and suggestions.

We've incorporated suggestions and comments from all stake holders

> 
>>>> +    mem = rcu_dereference(mm->mem_container);
>>>> +    /*
>>>> +     * For every charge from the container, increment reference
>>>> +     * count
>>>> +     */
>>>> +    css_get(&mem->css);
>>>> +    rcu_read_unlock();
>>>
>>> Where's the corresponding rcu_assign_pointer? Oh, in the next patch.
>>> That's
>>> unconventional (can you rearrange it so they go in together, please?).
>>>
>>>
>>
>>
>> The movement is associated with task migration, it should be easy to move
>> the patch around.
> 
> Thanks.
> 
> 
>>>> +    if (mem_container_charge(page, mm)) {
>>>> +        ret = VM_FAULT_OOM;
>>>> +        goto out;
>>>> +    }
>>>
>>> The oom-from-pagefault path is quite crap (not this code, mainline).
>>> It doesn't obey any of the given oom killing policy.
>>>
>>> I had a patch to fix this, but nobody else was too concerned at that
>>> stage. I don't expect that path would have been hit very much before,
>>> but with a lot of -EFAULTs coming out of containers, I think
>>> VM_FAULT_OOMs could easily trigger in practice now.
>>>
>>> http://lkml.org/lkml/2006/10/12/158
>>>
>>> Patch may be a little outdated, but the basics should still work. You
>>> might be in a good position to test it with these container patches?
>>>
>>
>>
>>
>> I'll test the patch. Thanks for pointing me in the right direction.
> 
> OK, thanks. Let me know how it goes and we could try again to get it
> merged.
> 
> 

Sure I'll start testing out that code.

>>>> @@ -582,6 +589,12 @@ void page_add_file_rmap(struct page *pag
>>>> {
>>>>     if (atomic_inc_and_test(&page->_mapcount))
>>>>         __inc_zone_page_state(page, NR_FILE_MAPPED);
>>>> +    else
>>>> +        /*
>>>> +         * We unconditionally charged during prepare, we uncharge here
>>>> +         * This takes care of balancing the reference counts
>>>> +         */
>>>> +        mem_container_uncharge_page(page);
>>>> }
>>>
>>> What's "during prepare"? Better would be "before adding the page to
>>> page tables" or something.
>>>
>>
>>
>> Yeah.. I'll change that comment
>>
>>
>>> But... why do you do it? The refcounts and charging are alrady taken
>>> care of in your charge function, aren't they? Just unconditionally
>>> charge at map time and unconditionally uncharge at unmap time, and
>>> let your accounting implementation take care of what to actually do.
>>>
>>> (This is what I mean about mem container creeping into core code --
>>> it's fine to have some tasteful hooks, but introducing these more
>>> complex interactions between core VM and container accounting details
>>> is nasty).
>>>
>>> I would much prefer this whole thing to _not_ to hook into rmap like
>>> this at all. Do the call unconditionally, and your container
>>> implementation can do as much weird and wonderful refcounting as its
>>> heart desires.
>>>
>>
>>
>> The reason why we have the accounting this way is because
>>
>> After we charge, some other code path could fail and we need
>> to uncharge the page. We do most of the refcounting internally.
>> mem_container_charge() could fail if the container is over
>> its limit and we cannot reclaim enough pages to allow a new
>> charge to be added. In that case we go to OOM.
> 
> But at this point you have already charged the container, and have put
> it in the page tables, if I read correctly. Nothing is going to fail
> at this point and the page could get uncharged when it is unmapped?
> 
> 

Here's the sequence of steps

1. Charge (we might need to block on reclaim)
2. Check to see if there is a race in updating the PTE
3. Add to the page table, update _mapcount under a lock

Several times the error occurs in step 2, due to which we need to uncharge
the memory container.

>>>> @@ -80,6 +81,11 @@ static int __add_to_swap_cache(struct pa
>>>>     BUG_ON(PagePrivate(page));
>>>>     error = radix_tree_preload(gfp_mask);
>>>>     if (!error) {
>>>> +
>>>> +        error = mem_container_charge(page, current->mm);
>>>> +        if (error)
>>>> +            goto out;
>>>> +
>>>>         write_lock_irq(&swapper_space.tree_lock);
>>>>         error = radix_tree_insert(&swapper_space.page_tree,
>>>>                         entry.val, page);
>>>> @@ -89,10 +95,13 @@ static int __add_to_swap_cache(struct pa
>>>>             set_page_private(page, entry.val);
>>>>             total_swapcache_pages++;
>>>>             __inc_zone_page_state(page, NR_FILE_PAGES);
>>>> -        }
>>>> +        } else
>>>> +            mem_container_uncharge_page(page);
>>>> +
>>>>         write_unlock_irq(&swapper_space.tree_lock);
>>>>         radix_tree_preload_end();
>>>>     }
>>>> +out:
>>>>     return error;
>>>> }
>>>
>>> Uh. You have to be really careful here (and in add_to_page_cache, and
>>> possibly others I haven't really looked)... you're ignoring gfp masks
>>> because mem_container_charge allocates with GFP_KERNEL. You can have
>>> all sorts of GFP_ATOMIC, GFP_NOIO, NOFS etc. come in these places that
>>> will deadlock.
>>>
>>
>>
>> I ran into some trouble with move_to_swap_cache() and
>> move_from_swap_cache()
>> since they use GFP_ATOMIC. Given the page is already accounted for and
>> refcounted, the refcounting helped avoid blocking.
> 
> add_to_page_cache gets called with GFP_ATOMIC as well, and it gets called
> with GFP_NOFS for new pages.
> 
> But I don't think you were suggesting that this isn't a problem, where
> you? Relying on implementation in the VM would signal more broken layering.
> 
> 

Good, thanks for spotting this. It should be possible to fix this case.
I'll work on a fix and send it out.

>>> I think I would much prefer you make mem_container_charge run
>>> atomically,
>>> and then have just a single hook at the site of where everything else is
>>> checked, rather than having all this pre-charge post-uncharge messiness.
>>> Ditto for mm/memory.c... all call sites really... should make them much
>>> nicer looking.
>>>
>>
>>
>> The problem is that we need to charge, before we add the page to the page
>> table and that needs to be a blocking context (since we reclaim when
>> the container goes over limit). The uncharge part comes in when we
>> handle errors from other paths (like I've mentioned earlier in this
>> email).
> 
> So I don't understand how you'd deal with GFP_ATOMIC and such restricted
> masks, then, if you want to block here.
> 
> It would be so so much easier and cleaner for the VM if you did all the
> accounting in page alloc and freeing hooks, and just put the page
> on per-container LRUs when it goes on the regular LRU.
> 

The advantage of the current approach is that not all page allocations have
to worry about charges. Only user space pages (unmapped cache and mapped RSS)
are accounted for.

We tried syncing the container and the global LRU. Pavel is working on
that approach. The problem with adding the page at the same time as the
regular LRU is that we end up calling the addition under the zone->lru_lock.
Holding the entire zone's lru_lock for list addition might be an overhead.
Although, we do the list isolation under the zone's lru lock.

> I'm still going to keep pushing for that approach until either someone
> explains why it can't be done or the current patch gets a fair bit
> cleaner. Has that already been tried and shown not to work? I would have
> thought so seeing as it would be the simplest patch, however I can't
> remember hearing about the actual problems with it.
> 

I am all eyes and ears to patches/suggestions/improvements to the current
container.

> But anyway I don't have any problems with it getting into -mm at this
> stage for more exposure and review. Maybe my concerns will get
> overridden anyway ;)
> 

Thanks, we'll do the right thing to override your concerns (whether that
is to change code or to convince you that we are already doing it
right)


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-28 11:39       ` Balbir Singh
@ 2007-08-29  7:28         ` Nick Piggin
  2007-08-29  8:15           ` Balbir Singh
  0 siblings, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2007-08-29  7:28 UTC (permalink / raw)
  To: balbir
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Balbir Singh wrote:
> Nick Piggin wrote:

>>Sure. And if all you intend is workload management, then that's probably
>>fine. If you want guarantees, then its useless on its own.
>>
> 
> 
> Yes. Guarantees are hard to do with just this controller (due to non-reclaimable
> pages). It should be easy to add a mlock() controller on top of this one.
> Pavel is planning to work on a kernel memory controller. With that in place,
> guarantees might be possible.

So it doesn't sound like they are necessarily a requirement for linux
kernel memory control -- that's fine, I just want to know where you
stand with that.

>>I don't mean to say it would be done on purpose, but without a coherent
>>strategy then things might slowly just get added as people think they are
>>needed. I'm fairly sure several people want to really guarantee memory
>>resources in an untrusted environment, don't they? And that's obviously
>>not going to scale by putting calls all throughout the kernel.
>>
> 
> 
> We sent out an RFC last year and got several comments from stake holders
> 
> 1. Pavel Emelianov
> 2. Paul Menage
> 3. Vaidyanathan Srinivasan
> 4. YAMAMOTO Takshi
> 5. KAMEZAWA Hiroyuki
> 
> 
> At the OLS resource management BoF, we had a broader participation and
> several comments and suggestions.
> 
> We've incorporated suggestions and comments from all stake holders

:) I know everyone's really worked hard at this and is trying to do the
right thing. But for example some of the non-container VM people may not
know exactly what you are up to or planning (I don't). It can make things
a bit harder to review IF there is an expectation that more functionality
is going to be required (which is what my expectation is).

Anyway, let's not continue this tangent. Instead, I'll try to be more
constructive and ask for eg. your future plans if they are not clear.

>>But at this point you have already charged the container, and have put
>>it in the page tables, if I read correctly. Nothing is going to fail
>>at this point and the page could get uncharged when it is unmapped?
>>
>>
> 
> 
> Here's the sequence of steps
> 
> 1. Charge (we might need to block on reclaim)
> 2. Check to see if there is a race in updating the PTE
> 3. Add to the page table, update _mapcount under a lock
> 
> Several times the error occurs in step 2, due to which we need to uncharge
> the memory container.

Oh yeah, I understand all _those_ uncharging calls you do. But let me
just quote the code again:

diff -puN mm/rmap.c~memory-controller-memory-accounting-v7 mm/rmap.c
--- a/mm/rmap.c~memory-controller-memory-accounting-v7
+++ a/mm/rmap.c
@@ -48,6 +48,7 @@
  #include <linux/rcupdate.h>
  #include <linux/module.h>
  #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>

  #include <asm/tlbflush.h>

@@ -550,8 +551,14 @@ void page_add_anon_rmap(struct page *pag
      VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
      if (atomic_inc_and_test(&page->_mapcount))
          __page_set_anon_rmap(page, vma, address);
-    else
+    else {
          __page_check_anon_rmap(page, vma, address);
+        /*
+         * We unconditionally charged during prepare, we uncharge here
+         * This takes care of balancing the reference counts
+         */
+        mem_container_uncharge_page(page);
+    }
  }

At the point you are uncharging here, the pte has been updated and
the page is in the pagetables of the process. I guess this uncharging
works on the presumption that you are not the first container to map
the page, but I thought that you already check for that in your
accounting implementation.

Now how does it take care of the refcounts? I guess it is because your
rmap removal function also takes care of refcounts by only uncharging
if mapcount has gone to zero... however that's polluting the VM with
knowledge that your accounting scheme is a first-touch one, isn't it?

Aside, I'm slightly suspicious of whether this is correct against
mapcount races, but I didn't look closely yet. I remember you bringing
that up with me, so I guess you've been careful there...

>>add_to_page_cache gets called with GFP_ATOMIC as well, and it gets called
>>with GFP_NOFS for new pages.
>>
>>But I don't think you were suggesting that this isn't a problem, where
>>you? Relying on implementation in the VM would signal more broken layering.
>>
>>
> 
> 
> Good, thanks for spotting this. It should be possible to fix this case.
> I'll work on a fix and send it out.

OK.

>>It would be so so much easier and cleaner for the VM if you did all the
>>accounting in page alloc and freeing hooks, and just put the page
>>on per-container LRUs when it goes on the regular LRU.
>>
> 
> 
> The advantage of the current approach is that not all page allocations have
> to worry about charges. Only user space pages (unmapped cache and mapped RSS)
> are accounted for.

That could be true. OTOH, if you have a significant amount of non userspace
page allocations happening, the chances are that your userspace-only
accounting is going out the window too :)

You'd still be able to avoid charging if a process isn't in a container
or accounting is turned off, of course.

> We tried syncing the container and the global LRU. Pavel is working on
> that approach. The problem with adding the page at the same time as the
> regular LRU is that we end up calling the addition under the zone->lru_lock.
> Holding the entire zone's lru_lock for list addition might be an overhead.
> Although, we do the list isolation under the zone's lru lock.

Well it may not have to be _exactly_ the same time as the page goes on the
normal lru. You could try doing it in pagevec_lru_add, for example, after
the lru locks have been released?

>>I'm still going to keep pushing for that approach until either someone
>>explains why it can't be done or the current patch gets a fair bit
>>cleaner. Has that already been tried and shown not to work? I would have
>>thought so seeing as it would be the simplest patch, however I can't
>>remember hearing about the actual problems with it.
>>
> 
> 
> I am all eyes and ears to patches/suggestions/improvements to the current
> container.

Well I have made a few. I'm actually not too interested in containers and
resource control myself, but I suspect it may be a fair bit easier to play
around with eg. my ideas for VM hooks with an implementation in -mm, so I
might get motivated to try a patch...

Thanks,
Nick

-- 
SUSE Labs, Novell Inc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-29  7:28         ` Nick Piggin
@ 2007-08-29  8:15           ` Balbir Singh
  2007-08-30  7:39             ` Nick Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Balbir Singh @ 2007-08-29  8:15 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Nick Piggin wrote:
> Balbir Singh wrote:
>> Nick Piggin wrote:
> 
>>> Sure. And if all you intend is workload management, then that's probably
>>> fine. If you want guarantees, then its useless on its own.
>>>
>>
>>
>> Yes. Guarantees are hard to do with just this controller (due to
>> non-reclaimable
>> pages). It should be easy to add a mlock() controller on top of this one.
>> Pavel is planning to work on a kernel memory controller. With that in
>> place,
>> guarantees might be possible.
> 
> So it doesn't sound like they are necessarily a requirement for linux
> kernel memory control -- that's fine, I just want to know where you
> stand with that.
> 
> 
>>> I don't mean to say it would be done on purpose, but without a coherent
>>> strategy then things might slowly just get added as people think they
>>> are
>>> needed. I'm fairly sure several people want to really guarantee memory
>>> resources in an untrusted environment, don't they? And that's obviously
>>> not going to scale by putting calls all throughout the kernel.
>>>
>>
>>
>> We sent out an RFC last year and got several comments from stake holders
>>
>> 1. Pavel Emelianov
>> 2. Paul Menage
>> 3. Vaidyanathan Srinivasan
>> 4. YAMAMOTO Takshi
>> 5. KAMEZAWA Hiroyuki
>>
>>
>> At the OLS resource management BoF, we had a broader participation and
>> several comments and suggestions.
>>
>> We've incorporated suggestions and comments from all stake holders
> 
> :) I know everyone's really worked hard at this and is trying to do the
> right thing. But for example some of the non-container VM people may not
> know exactly what you are up to or planning (I don't). It can make things
> a bit harder to review IF there is an expectation that more functionality
> is going to be required (which is what my expectation is).
> 
> Anyway, let's not continue this tangent. Instead, I'll try to be more
> constructive and ask for eg. your future plans if they are not clear.
> 

Thanks, I am going to be present at the VM summit and we have a roadmap
as well. We can discuss it and we'll also send it out for comments.
Just FYI, we started with an RFC about a year ago and got several
comments from non-container folks. We'll try to discuss the roadmap
further and get in more inputs.

> 
>>> But at this point you have already charged the container, and have put
>>> it in the page tables, if I read correctly. Nothing is going to fail
>>> at this point and the page could get uncharged when it is unmapped?
>>>
>>>
>>
>>
>> Here's the sequence of steps
>>
>> 1. Charge (we might need to block on reclaim)
>> 2. Check to see if there is a race in updating the PTE
>> 3. Add to the page table, update _mapcount under a lock
>>
>> Several times the error occurs in step 2, due to which we need to
>> uncharge
>> the memory container.
> 
> Oh yeah, I understand all _those_ uncharging calls you do. But let me
> just quote the code again:
> 
> diff -puN mm/rmap.c~memory-controller-memory-accounting-v7 mm/rmap.c
> --- a/mm/rmap.c~memory-controller-memory-accounting-v7
> +++ a/mm/rmap.c
> @@ -48,6 +48,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/module.h>
>  #include <linux/kallsyms.h>
> +#include <linux/memcontrol.h>
> 
>  #include <asm/tlbflush.h>
> 
> @@ -550,8 +551,14 @@ void page_add_anon_rmap(struct page *pag
>      VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
>      if (atomic_inc_and_test(&page->_mapcount))
>          __page_set_anon_rmap(page, vma, address);
> -    else
> +    else {
>          __page_check_anon_rmap(page, vma, address);
> +        /*
> +         * We unconditionally charged during prepare, we uncharge here
> +         * This takes care of balancing the reference counts
> +         */
> +        mem_container_uncharge_page(page);
> +    }
>  }
> 
> At the point you are uncharging here, the pte has been updated and
> the page is in the pagetables of the process. I guess this uncharging
> works on the presumption that you are not the first container to map
> the page, but I thought that you already check for that in your
> accounting implementation.
> 
> Now how does it take care of the refcounts? I guess it is because your
> rmap removal function also takes care of refcounts by only uncharging
> if mapcount has gone to zero... however that's polluting the VM with
> knowledge that your accounting scheme is a first-touch one, isn't it?
> 
> Aside, I'm slightly suspicious of whether this is correct against
> mapcount races, but I didn't look closely yet. I remember you bringing
> that up with me, so I guess you've been careful there...
> 

Very good review comment. Here's what we see

1. Page comes in through page cache, we increment the reference ount
2. Page comes into rmap, we increment the refcount again
3. We race in page_add.*rmap(), the problem we have is that for
   the same page, for rmap(), step 2 would have taken place more than
   once
4. That's why we uncharge

I think I need to add a big fat comment in the ref_cnt member. ref_cnt
is held once for the page in the page cache and once for the page mapped
anywhere in the page tables.

reference counting helps us correctly determine when to take the page
of the LRU (it moves from page tables to swap cache, but it's still
on the LRU).

>>> add_to_page_cache gets called with GFP_ATOMIC as well, and it gets
>>> called
>>> with GFP_NOFS for new pages.
>>>
>>> But I don't think you were suggesting that this isn't a problem, where
>>> you? Relying on implementation in the VM would signal more broken
>>> layering.
>>>
>>>
>>
>>
>> Good, thanks for spotting this. It should be possible to fix this case.
>> I'll work on a fix and send it out.
> 
> OK.
> 
> 
>>> It would be so so much easier and cleaner for the VM if you did all the
>>> accounting in page alloc and freeing hooks, and just put the page
>>> on per-container LRUs when it goes on the regular LRU.
>>>
>>
>>
>> The advantage of the current approach is that not all page allocations
>> have
>> to worry about charges. Only user space pages (unmapped cache and
>> mapped RSS)
>> are accounted for.
> 
> That could be true. OTOH, if you have a significant amount of non userspace
> page allocations happening, the chances are that your userspace-only
> accounting is going out the window too :)
> 
> You'd still be able to avoid charging if a process isn't in a container
> or accounting is turned off, of course.
> 

Yes, we need a mechanism to turn off the container at boot time. Currently
if the config is enabled, we charge :-)

> 
>> We tried syncing the container and the global LRU. Pavel is working on
>> that approach. The problem with adding the page at the same time as the
>> regular LRU is that we end up calling the addition under the
>> zone->lru_lock.
>> Holding the entire zone's lru_lock for list addition might be an
>> overhead.
>> Although, we do the list isolation under the zone's lru lock.
> 
> Well it may not have to be _exactly_ the same time as the page goes on the
> normal lru. You could try doing it in pagevec_lru_add, for example, after
> the lru locks have been released?
> 

I'll play around with Pavel's patches or work with him to explore this
further.

> 
>>> I'm still going to keep pushing for that approach until either someone
>>> explains why it can't be done or the current patch gets a fair bit
>>> cleaner. Has that already been tried and shown not to work? I would have
>>> thought so seeing as it would be the simplest patch, however I can't
>>> remember hearing about the actual problems with it.
>>>
>>
>>
>> I am all eyes and ears to patches/suggestions/improvements to the current
>> container.
> 
> Well I have made a few. I'm actually not too interested in containers and
> resource control myself, but I suspect it may be a fair bit easier to play
> around with eg. my ideas for VM hooks with an implementation in -mm, so I
> might get motivated to try a patch...
> 

Great! Thanks!

> Thanks,
> Nick
> 


-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-29  8:15           ` Balbir Singh
@ 2007-08-30  7:39             ` Nick Piggin
  2007-08-30  9:04               ` Balbir Singh
  0 siblings, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2007-08-30  7:39 UTC (permalink / raw)
  To: balbir
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Balbir Singh wrote:
> Nick Piggin wrote:

>>Oh yeah, I understand all _those_ uncharging calls you do. But let me
>>just quote the code again:
>>
>>diff -puN mm/rmap.c~memory-controller-memory-accounting-v7 mm/rmap.c
>>--- a/mm/rmap.c~memory-controller-memory-accounting-v7
>>+++ a/mm/rmap.c
>>@@ -48,6 +48,7 @@
>> #include <linux/rcupdate.h>
>> #include <linux/module.h>
>> #include <linux/kallsyms.h>
>>+#include <linux/memcontrol.h>
>>
>> #include <asm/tlbflush.h>
>>
>>@@ -550,8 +551,14 @@ void page_add_anon_rmap(struct page *pag
>>     VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
>>     if (atomic_inc_and_test(&page->_mapcount))
>>         __page_set_anon_rmap(page, vma, address);
>>-    else
>>+    else {
>>         __page_check_anon_rmap(page, vma, address);
>>+        /*
>>+         * We unconditionally charged during prepare, we uncharge here
>>+         * This takes care of balancing the reference counts
>>+         */
>>+        mem_container_uncharge_page(page);
>>+    }
>> }
>>
>>At the point you are uncharging here, the pte has been updated and
>>the page is in the pagetables of the process. I guess this uncharging
>>works on the presumption that you are not the first container to map
>>the page, but I thought that you already check for that in your
>>accounting implementation.
>>
>>Now how does it take care of the refcounts? I guess it is because your
>>rmap removal function also takes care of refcounts by only uncharging
>>if mapcount has gone to zero... however that's polluting the VM with
>>knowledge that your accounting scheme is a first-touch one, isn't it?
>>
>>Aside, I'm slightly suspicious of whether this is correct against
>>mapcount races, but I didn't look closely yet. I remember you bringing
>>that up with me, so I guess you've been careful there...
>>
> 
> 
> Very good review comment. Here's what we see
> 
> 1. Page comes in through page cache, we increment the reference ount
> 2. Page comes into rmap, we increment the refcount again
> 3. We race in page_add.*rmap(), the problem we have is that for
>    the same page, for rmap(), step 2 would have taken place more than
>    once
> 4. That's why we uncharge
> 
> I think I need to add a big fat comment in the ref_cnt member. ref_cnt
> is held once for the page in the page cache and once for the page mapped
> anywhere in the page tables.
> 
> reference counting helps us correctly determine when to take the page
> of the LRU (it moves from page tables to swap cache, but it's still
> on the LRU).

Still don't understand. You increment the refcount once when you put
the page in the pagecache, then again when the first process maps the
page, then again while subsequent processes map the page but you soon
drop it afterwards. That's fine, I don't pretend to understand why
you're doing it, but presumably the controller has a good reason for
that.

But my point is, why should the VM know or care about that? You should
handle all those details in your controller. If, in order to do that,
you need to differentiate between when a process puts a page in
pagecache and when it maps a page, that's fine, just use different
hooks for those events.

The situation now is that your one hook is not actually a "this page
was mapped" hook, or a "this page was added to pagecache", or "we are
about to map this page". These are easy for VM maintainers to maintain
because they're simple VM concepts.

But your hook is "increment ref_cnt and do some other stuff". So now
the VM needs to know about when and why your container implementation
needs to increment and decrement this ref_cnt. I don't know this, and
I don't want to know this ;)

-- 
SUSE Labs, Novell Inc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-30  7:39             ` Nick Piggin
@ 2007-08-30  9:04               ` Balbir Singh
  2007-08-31  0:35                 ` Nick Piggin
  0 siblings, 1 reply; 11+ messages in thread
From: Balbir Singh @ 2007-08-30  9:04 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Nick Piggin wrote:
> Balbir Singh wrote:
>> Nick Piggin wrote:
> 
>> Very good review comment. Here's what we see
>>
>> 1. Page comes in through page cache, we increment the reference ount
>> 2. Page comes into rmap, we increment the refcount again
>> 3. We race in page_add.*rmap(), the problem we have is that for
>>    the same page, for rmap(), step 2 would have taken place more than
>>    once
>> 4. That's why we uncharge
>>
>> I think I need to add a big fat comment in the ref_cnt member. ref_cnt
>> is held once for the page in the page cache and once for the page mapped
>> anywhere in the page tables.
>>
>> reference counting helps us correctly determine when to take the page
>> of the LRU (it moves from page tables to swap cache, but it's still
>> on the LRU).
> 
> Still don't understand. You increment the refcount once when you put
> the page in the pagecache, then again when the first process maps the
> page, then again while subsequent processes map the page but you soon
> drop it afterwards. That's fine, I don't pretend to understand why
> you're doing it, but presumably the controller has a good reason for
> that.
> 
> But my point is, why should the VM know or care about that? You should
> handle all those details in your controller. If, in order to do that,
> you need to differentiate between when a process puts a page in
> pagecache and when it maps a page, that's fine, just use different
> hooks for those events.
> 
> The situation now is that your one hook is not actually a "this page
> was mapped" hook, or a "this page was added to pagecache", or "we are
> about to map this page". These are easy for VM maintainers to maintain
> because they're simple VM concepts.
> 
> But your hook is "increment ref_cnt and do some other stuff". So now
> the VM needs to know about when and why your container implementation
> needs to increment and decrement this ref_cnt. I don't know this, and
> I don't want to know this ;)
> 

My hook really is -- there was a race, there is no rmap lock to prevent
several independent processes from mapping the same page into their
page tables. I want to increment the reference count just once (apart from
it being accounted in the page cache), since we account the page once.

I'll revisit this hook to see if it can be made cleaner

Thanks for your detailed review comments.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-30  9:04               ` Balbir Singh
@ 2007-08-31  0:35                 ` Nick Piggin
  2007-08-31  4:40                   ` Balbir Singh
  0 siblings, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2007-08-31  0:35 UTC (permalink / raw)
  To: balbir
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

Balbir Singh wrote:
> Nick Piggin wrote:
> 
>>Balbir Singh wrote:
>>
>>>Nick Piggin wrote:
>>
>>>Very good review comment. Here's what we see
>>>
>>>1. Page comes in through page cache, we increment the reference ount
>>>2. Page comes into rmap, we increment the refcount again
>>>3. We race in page_add.*rmap(), the problem we have is that for
>>>   the same page, for rmap(), step 2 would have taken place more than
>>>   once
>>>4. That's why we uncharge
>>>
>>>I think I need to add a big fat comment in the ref_cnt member. ref_cnt
>>>is held once for the page in the page cache and once for the page mapped
>>>anywhere in the page tables.
>>>
>>>reference counting helps us correctly determine when to take the page
>>>of the LRU (it moves from page tables to swap cache, but it's still
>>>on the LRU).
>>
>>Still don't understand. You increment the refcount once when you put
>>the page in the pagecache, then again when the first process maps the
>>page, then again while subsequent processes map the page but you soon
>>drop it afterwards. That's fine, I don't pretend to understand why
>>you're doing it, but presumably the controller has a good reason for
>>that.
>>
>>But my point is, why should the VM know or care about that? You should
>>handle all those details in your controller. If, in order to do that,
>>you need to differentiate between when a process puts a page in
>>pagecache and when it maps a page, that's fine, just use different
>>hooks for those events.
>>
>>The situation now is that your one hook is not actually a "this page
>>was mapped" hook, or a "this page was added to pagecache", or "we are
>>about to map this page". These are easy for VM maintainers to maintain
>>because they're simple VM concepts.
>>
>>But your hook is "increment ref_cnt and do some other stuff". So now
>>the VM needs to know about when and why your container implementation
>>needs to increment and decrement this ref_cnt. I don't know this, and
>>I don't want to know this ;)
>>
> 
> 
> My hook really is -- there was a race, there is no rmap lock to prevent
> several independent processes from mapping the same page into their
> page tables. I want to increment the reference count just once (apart from
> it being accounted in the page cache), since we account the page once.
> 
> I'll revisit this hook to see if it can be made cleaner

If you just have a different hook for mapping a page into the page
tables, your controller can take care of any races, no?

-- 
SUSE Labs, Novell Inc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: + memory-controller-memory-accounting-v7.patch added to -mm tree
  2007-08-31  0:35                 ` Nick Piggin
@ 2007-08-31  4:40                   ` Balbir Singh
  0 siblings, 0 replies; 11+ messages in thread
From: Balbir Singh @ 2007-08-31  4:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: akpm, a.p.zijlstra, dev, ebiederm, herbert, menage, rientjes,
	svaidy, xemul, Linux Memory Management

<snip>
Nick Piggin wrote:
>>
>> My hook really is -- there was a race, there is no rmap lock to prevent
>> several independent processes from mapping the same page into their
>> page tables. I want to increment the reference count just once (apart
>> from
>> it being accounted in the page cache), since we account the page once.
>>
>> I'll revisit this hook to see if it can be made cleaner
> 
> If you just have a different hook for mapping a page into the page
> tables, your controller can take care of any races, no?
> 

We increment the reference count in the mem_container_charge() routine.
Not all pages into page tables, so it makes sense to do the reference
counting there. The charge routine, also does reclaim, so we cannot call
it at mapping time, since the mapping happens under pte lock.

I'll see if I can refactor the way we reference count, I agree that
it would simplify code maintenance.

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2007-08-31  4:40 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-27 21:19 + memory-controller-memory-accounting-v7.patch added to -mm tree akpm
2007-08-28  6:35 ` Nick Piggin
2007-08-28  7:26   ` Balbir Singh
2007-08-28  9:29     ` Nick Piggin
2007-08-28 11:39       ` Balbir Singh
2007-08-29  7:28         ` Nick Piggin
2007-08-29  8:15           ` Balbir Singh
2007-08-30  7:39             ` Nick Piggin
2007-08-30  9:04               ` Balbir Singh
2007-08-31  0:35                 ` Nick Piggin
2007-08-31  4:40                   ` Balbir Singh

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.