[RFC/PATCH] cgroup swap subsystem

All of lore.kernel.org
 help / color / mirror / Atom feed

* [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  5:59 ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-05  5:59 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg
  Cc: xemul-GEFAQzZX7r8dnm+yROfE0A,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Hi.

Even when memory usage is limited by the cgroup memory subsystem
or memory is isolated by cpusets, swap space is still shared, so
resource isolation is incomplete. If one group uses up all the
swap space, it can affect other groups.

I have written a patch for a swap subsystem, based on the memory
subsystem, which limits swap usage per cgroup.
It can currently charge and limit swap usage.

I implemented this feature as a new subsystem rather than
as part of the memory subsystem, because I don't want to
make big changes to memcontrol.c. Even as a separate
subsystem, users can manage memory and swap in
the same cgroup directory if they mount the two together.

Basic idea of my implementation:
  - what will be charged ?
    the number of swap entries.

  - when to charge/uncharge ?
    charge at get_swap_page(), and uncharge at swap_entry_free().

  - to which group is the swap entry charged ?
    To determine which swap_cgroup (corresponding to mem_cgroup in
    the memory subsystem) the swap entry should be charged to,
    I added a pointer to the mm_struct to page_cgroup (pc->pc_mm), and
    changed the argument of get_swap_page() from (void) to
    (struct page *). As a result, get_swap_page() can determine
    which swap_cgroup it should charge the swap entry to
    by referring to page->page_cgroup->pc_mm->swap_cgroup.

  - from which group is the swap entry uncharged ?
    I added to swap_info_struct a member 'struct swap_cgroup **swap_cgroup',
    an array, indexed by swap offset, of pointers to the swap_cgroup
    each swap entry is charged to.

Todo:
  - rebase onto a newer kernel, and split into several patches.
  - Merge with memory subsystem (if it would be better), or
    remove dependency on CONFIG_CGROUP_MEM_CONT if possible
    (this requires making page_cgroup more generic).
  - More tests, cleanups, and features   :-)


Any comments or discussions would be appreciated.

Thanks,
Daisuke Nishimura


Signed-off-by: Daisuke Nishimura <nishimura-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>

---
diff -uprN linux-2.6.24-mm1/include/linux/cgroup_subsys.h linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h
--- linux-2.6.24-mm1/include/linux/cgroup_subsys.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h	2008-03-03 10:56:56.000000000 +0900
@@ -42,3 +42,9 @@ SUBSYS(mem_cgroup)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+SUBSYS(swap)
+#endif
+
+/* */
diff -uprN linux-2.6.24-mm1/include/linux/memcontrol.h linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h
--- linux-2.6.24-mm1/include/linux/memcontrol.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h	2008-03-03 10:56:56.000000000 +0900
@@ -29,6 +29,21 @@ struct page;
 struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_CONT
+/*
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ */
+struct page_cgroup {
+	struct list_head lru;		/* per cgroup LRU list */
+	struct page *page;
+	struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct mm_struct *pc_mm;
+#endif
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
+	int	 flags;
+};
 
 extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
 extern void mm_free_cgroup(struct mm_struct *mm);
diff -uprN linux-2.6.24-mm1/include/linux/mm_types.h linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h
--- linux-2.6.24-mm1/include/linux/mm_types.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h	2008-03-03 10:56:56.000000000 +0900
@@ -233,6 +233,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup *swap_cgroup;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff -uprN linux-2.6.24-mm1/include/linux/swap.h linux-2.6.24-mm1-swaplimit/include/linux/swap.h
--- linux-2.6.24-mm1/include/linux/swap.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/swap.h	2008-03-03 10:56:56.000000000 +0900
@@ -7,6 +7,7 @@
 #include <linux/list.h>
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
+#include <linux/swap_limit.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -141,6 +142,9 @@ struct swap_info_struct {
 	struct swap_extent *curr_swap_extent;
 	unsigned old_block_size;
 	unsigned short * swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup **swap_cgroup;
+#endif
 	unsigned int lowest_bit;
 	unsigned int highest_bit;
 	unsigned int cluster_next;
@@ -239,7 +243,7 @@ extern struct page *swapin_readahead(swp
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
 extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page(struct page *);
 extern swp_entry_t get_swap_page_of_type(int);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
@@ -342,7 +346,7 @@ static inline int remove_exclusive_swap_
 	return 0;
 }
 
-static inline swp_entry_t get_swap_page(void)
+static inline swp_entry_t get_swap_page(struct page *page)
 {
 	swp_entry_t entry;
 	entry.val = 0;
diff -uprN linux-2.6.24-mm1/include/linux/swap_limit.h linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h
--- linux-2.6.24-mm1/include/linux/swap_limit.h	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h	2008-03-03 10:56:56.000000000 +0900
@@ -0,0 +1,65 @@
+/*
+ * swap_limit.h
+ *
+ */
+#ifndef _LINUX_SWAP_LIMIT_H
+#define _LINUX_SWAP_LIMIT_H
+
+#include <linux/swap.h>
+#include <linux/cgroup.h>
+#include <linux/res_counter.h>
+
+struct swap_cgroup;
+struct swap_info_struct;
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+struct swap_cgroup {
+	struct cgroup_subsys_state css;
+	struct res_counter res;
+};
+
+static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, swap_subsys_id),
+				struct swap_cgroup,
+				css);
+}
+
+static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, swap_subsys_id),
+				struct swap_cgroup, css);
+}
+
+extern int swap_cgroup_charge(struct page *page,
+				struct swap_info_struct *si,
+				unsigned long offset);
+extern void swap_cgroup_uncharge(struct swap_info_struct *si,
+				unsigned long offset);
+
+#else /* CONFIG_CGROUP_SWAP_LIMIT */
+static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
+{
+	return NULL;
+}
+
+static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
+{
+	return NULL;
+}
+
+static inline int swap_cgroup_charge(struct page *page,
+					struct swap_info_struct *si,
+					unsigned long offset)
+{
+	return 0;
+}
+
+static inline void swap_cgroup_uncharge(struct swap_info_struct *si,
+					unsigned long offset)
+{
+}
+
+#endif
+
+#endif
diff -uprN linux-2.6.24-mm1/init/Kconfig linux-2.6.24-mm1-swaplimit/init/Kconfig
--- linux-2.6.24-mm1/init/Kconfig	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/init/Kconfig	2008-03-03 10:56:56.000000000 +0900
@@ -383,6 +383,12 @@ config CGROUP_MEM_CONT
 	  Provides a memory controller that manages both page cache and
 	  RSS memory.
 
+config CGROUP_SWAP_LIMIT
+	bool "cgroup subsystem for swap"
+	depends on CGROUP_MEM_CONT && SWAP
+	help
+	  Provides a swap controller that manages and limits swap usage.
+
 config PROC_PID_CPUSET
 	bool "Include legacy /proc/<pid>/cpuset file"
 	depends on CPUSETS
diff -uprN linux-2.6.24-mm1/mm/Makefile linux-2.6.24-mm1-swaplimit/mm/Makefile
--- linux-2.6.24-mm1/mm/Makefile	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/Makefile	2008-03-03 10:56:56.000000000 +0900
@@ -32,4 +32,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
+obj-$(CONFIG_CGROUP_SWAP_LIMIT) += swap_limit.o
 
diff -uprN linux-2.6.24-mm1/mm/memcontrol.c linux-2.6.24-mm1-swaplimit/mm/memcontrol.c
--- linux-2.6.24-mm1/mm/memcontrol.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/memcontrol.c	2008-03-03 10:56:56.000000000 +0900
@@ -19,6 +19,7 @@
 
 #include <linux/res_counter.h>
 #include <linux/memcontrol.h>
+#include <linux/swap_limit.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -146,18 +147,6 @@ struct mem_cgroup {
 #define PAGE_CGROUP_LOCK_BIT 	0x0
 #define PAGE_CGROUP_LOCK 		(1 << PAGE_CGROUP_LOCK_BIT)
 
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
-					/* mapped and cached states     */
-	int	 flags;
-};
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
 
@@ -254,15 +243,27 @@ struct mem_cgroup *mem_cgroup_from_task(
 void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
 {
 	struct mem_cgroup *mem;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup *swap;
+#endif
 
 	mem = mem_cgroup_from_task(p);
 	css_get(&mem->css);
 	mm->mem_cgroup = mem;
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	swap = swap_cgroup_from_task(p);
+	css_get(&swap->css);
+	mm->swap_cgroup = swap;
+#endif
 }
 
 void mm_free_cgroup(struct mm_struct *mm)
 {
 	css_put(&mm->mem_cgroup->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	css_put(&mm->swap_cgroup->css);
+#endif
 }
 
 static inline int page_cgroup_locked(struct page *page)
@@ -664,6 +665,10 @@ retry:
 	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
 		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	atomic_inc(&mm->mm_count);
+	pc->pc_mm = mm;
+#endif
 
 	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
 		/*
@@ -673,6 +678,9 @@ retry:
 		 */
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+		mmdrop(mm);
+#endif
 		kfree(pc);
 		if (!page)
 			goto done;
@@ -744,6 +752,9 @@ void mem_cgroup_uncharge(struct page_cgr
 		if (clear_page_cgroup(page, pc) == pc) {
 			mem = pc->mem_cgroup;
 			css_put(&mem->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+			mmdrop(pc->pc_mm);
+#endif
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			spin_lock_irqsave(&mz->lru_lock, flags);
 			__mem_cgroup_remove_list(pc);
@@ -859,6 +870,9 @@ retry:
 		atomic_set(&pc->ref_cnt, 0);
 		if (clear_page_cgroup(page, pc) == pc) {
 			css_put(&mem->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+			mmdrop(pc->pc_mm);
+#endif
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			__mem_cgroup_remove_list(pc);
 			kfree(pc);
diff -uprN linux-2.6.24-mm1/mm/shmem.c linux-2.6.24-mm1-swaplimit/mm/shmem.c
--- linux-2.6.24-mm1/mm/shmem.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/shmem.c	2008-03-03 10:56:56.000000000 +0900
@@ -1024,7 +1024,7 @@ static int shmem_writepage(struct page *
 	 * want to check if there's a redundant swappage to be discarded.
 	 */
 	if (wbc->for_reclaim)
-		swap = get_swap_page();
+		swap = get_swap_page(page);
 	else
 		swap.val = 0;
 
diff -uprN linux-2.6.24-mm1/mm/swap_limit.c linux-2.6.24-mm1-swaplimit/mm/swap_limit.c
--- linux-2.6.24-mm1/mm/swap_limit.c	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/swap_limit.c	2008-03-05 14:39:23.000000000 +0900
@@ -0,0 +1,194 @@
+/*
+ * swap_limit.c - SWAP controller (based on memcontrol.c)
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/rcupdate.h>
+#include <linux/cgroup.h>
+#include <linux/res_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/swap_limit.h>
+
+static struct swap_cgroup init_swap_cgroup;
+
+int swap_cgroup_charge(struct page *page,
+			struct swap_info_struct *si,
+			unsigned long offset)
+{
+	int ret;
+	struct page_cgroup *pc;
+	struct mm_struct *mm;
+	struct swap_cgroup *swap;
+
+	BUG_ON(!page);
+
+	/*
+	 * Pages to be swapped out should have been charged by memory cgroup,
+	 * but very rarely, pc would be NULL (pc is not reliable without lock,
+	 * so I should fix here).
+	 * In such cases, we charge the init_mm now.
+	 */
+	pc = page_get_page_cgroup(page);
+	if (WARN_ON(!pc))
+		mm = &init_mm;
+	else
+		mm = pc->pc_mm;
+	BUG_ON(!mm);
+
+	rcu_read_lock();
+	swap = rcu_dereference(mm->swap_cgroup);
+	rcu_read_unlock();
+	BUG_ON(!swap);
+
+	ret = res_counter_charge(&swap->res, PAGE_SIZE);
+	if (!ret) {
+		css_get(&swap->css);
+		si->swap_cgroup[offset] = swap;
+	}
+
+	return ret;
+}
+
+void swap_cgroup_uncharge(struct swap_info_struct *si, unsigned long offset)
+{
+	struct swap_cgroup *swap = si->swap_cgroup[offset];
+
+	/*
+	 * "swap" would be NULL:
+	 *  1. when get_swap_page() failed at charging swap_cgroup,
+	 *     and called swap_entry_free().
+	 *  2. when this swap entry had been assigned by
+	 *     get_swap_page_of_type() (via SWSUSP ?).
+	 */
+	if (swap) {
+		res_counter_uncharge(&swap->res, PAGE_SIZE);
+		si->swap_cgroup[offset] = NULL;
+		css_put(&swap->css);
+	}
+}
+
+static struct cgroup_subsys_state *swap_cgroup_create(struct cgroup_subsys *ss,
+						      struct cgroup *cgrp)
+{
+	struct swap_cgroup *swap;
+
+	if (unlikely((cgrp->parent) == NULL)) {
+		swap = &init_swap_cgroup;
+		init_mm.swap_cgroup = swap;
+	} else
+		swap = kzalloc(sizeof(struct swap_cgroup), GFP_KERNEL);
+
+	if (swap == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	res_counter_init(&swap->res);
+
+	return &swap->css;
+}
+
+static void swap_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	kfree(swap_cgroup_from_cgrp(cgrp));
+}
+
+static ssize_t swap_cgroup_read(struct cgroup *cgrp,
+				struct cftype *cft, struct file *file,
+				char __user *userbuf, size_t nbytes,
+				loff_t *ppos)
+{
+	return res_counter_read(&swap_cgroup_from_cgrp(cgrp)->res,
+				cft->private, userbuf, nbytes, ppos,
+				NULL);
+}
+
+static int swap_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+	*tmp = memparse(buf, &buf);
+	if (*buf != '\0')
+		return -EINVAL;
+
+	/*
+	 * Round up the value to the closest page size
+	 */
+	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
+	return 0;
+}
+
+static ssize_t swap_cgroup_write(struct cgroup *cgrp, struct cftype *cft,
+				 struct file *file, const char __user *userbuf,
+				 size_t nbytes, loff_t *ppos)
+{
+	return res_counter_write(&swap_cgroup_from_cgrp(cgrp)->res,
+				 cft->private, userbuf, nbytes, ppos,
+				 swap_cgroup_write_strategy);
+}
+
+static struct cftype swap_files[] = {
+	{
+		.name = "usage_in_bytes",
+		.private = RES_USAGE,
+		.read = swap_cgroup_read,
+	},
+	{
+		.name = "limit_in_bytes",
+		.private = RES_LIMIT,
+		.write = swap_cgroup_write,
+		.read = swap_cgroup_read,
+	},
+	{
+		.name = "failcnt",
+		.private = RES_FAILCNT,
+		.read = swap_cgroup_read,
+	},
+};
+
+static int swap_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, swap_files, ARRAY_SIZE(swap_files));
+}
+
+static void swap_cgroup_move_task(struct cgroup_subsys *ss,
+				  struct cgroup *cgrp,
+				  struct cgroup *old_cgrp,
+				  struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct swap_cgroup *swap, *old_swap;
+
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return;
+
+	swap = swap_cgroup_from_cgrp(cgrp);
+	old_swap = swap_cgroup_from_cgrp(old_cgrp);
+
+	if (swap == old_swap)
+		goto out;
+
+	if (p->tgid != p->pid)
+		goto out;
+
+	css_get(&swap->css);
+	rcu_assign_pointer(mm->swap_cgroup, swap);
+	css_put(&old_swap->css);
+
+out:
+	mmput(mm);
+	return;
+}
+
+struct cgroup_subsys swap_subsys = {
+	.name = "swap",
+	.create = swap_cgroup_create,
+	.destroy = swap_cgroup_destroy,
+	.populate = swap_cgroup_populate,
+	.subsys_id = swap_subsys_id,
+	.attach = swap_cgroup_move_task,
+	.early_init = 0,
+};
diff -uprN linux-2.6.24-mm1/mm/swap_state.c linux-2.6.24-mm1-swaplimit/mm/swap_state.c
--- linux-2.6.24-mm1/mm/swap_state.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/swap_state.c	2008-03-03 10:56:56.000000000 +0900
@@ -128,7 +128,7 @@ int add_to_swap(struct page * page, gfp_
 	BUG_ON(!PageUptodate(page));
 
 	for (;;) {
-		entry = get_swap_page();
+		entry = get_swap_page(page);
 		if (!entry.val)
 			return 0;
 
diff -uprN linux-2.6.24-mm1/mm/swapfile.c linux-2.6.24-mm1-swaplimit/mm/swapfile.c
--- linux-2.6.24-mm1/mm/swapfile.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/swapfile.c	2008-03-03 10:56:56.000000000 +0900
@@ -28,6 +28,7 @@
 #include <linux/capability.h>
 #include <linux/syscalls.h>
 #include <linux/memcontrol.h>
+#include <linux/swap_limit.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -172,7 +173,10 @@ no_page:
 	return 0;
 }
 
-swp_entry_t get_swap_page(void)
+/* get_swap_page() calls this */
+static int swap_entry_free(struct swap_info_struct *, unsigned long);
+
+swp_entry_t get_swap_page(struct page *page)
 {
 	struct swap_info_struct *si;
 	pgoff_t offset;
@@ -201,6 +205,16 @@ swp_entry_t get_swap_page(void)
 		swap_list.next = next;
 		offset = scan_swap_map(si);
 		if (offset) {
+			/*
+			 * This should be the first use of this swap entry,
+			 * so charge this swap entry now.
+			 */
+			if (swap_cgroup_charge(page, si, offset)) {
+				/* should free this entry */
+				swap_entry_free(si, offset);
+
+				goto noswap;
+			}
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
 		}
@@ -285,6 +299,7 @@ static int swap_entry_free(struct swap_i
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			swap_cgroup_uncharge(p, offset);
 		}
 	}
 	return count;
@@ -1207,6 +1222,9 @@ asmlinkage long sys_swapoff(const char _
 {
 	struct swap_info_struct * p = NULL;
 	unsigned short *swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup **swap_cgroup;
+#endif
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
 	struct inode *inode;
@@ -1309,10 +1327,17 @@ asmlinkage long sys_swapoff(const char _
 	p->max = 0;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	swap_cgroup = p->swap_cgroup;
+	p->swap_cgroup = NULL;
+#endif
 	p->flags = 0;
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	vfree(swap_cgroup);
+#endif
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1460,6 +1485,9 @@ asmlinkage long sys_swapon(const char __
 	unsigned long maxpages = 1;
 	int swapfilesize;
 	unsigned short *swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup **swap_cgroup;
+#endif
 	struct page *page = NULL;
 	struct inode *inode = NULL;
 	int did_down = 0;
@@ -1483,6 +1511,9 @@ asmlinkage long sys_swapon(const char __
 	p->swap_file = NULL;
 	p->old_block_size = 0;
 	p->swap_map = NULL;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	p->swap_cgroup = NULL;
+#endif
 	p->lowest_bit = 0;
 	p->highest_bit = 0;
 	p->cluster_nr = 0;
@@ -1647,6 +1678,15 @@ asmlinkage long sys_swapon(const char __
 				1 /* header page */;
 		if (error)
 			goto bad_swap;
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+		p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
+		if (!(p->swap_cgroup)) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
+#endif
 	}
 
 	if (nr_good_pages) {
@@ -1704,13 +1744,22 @@ bad_swap:
 bad_swap_2:
 	spin_lock(&swap_lock);
 	swap_map = p->swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	swap_cgroup = p->swap_cgroup;
+#endif
 	p->swap_file = NULL;
 	p->swap_map = NULL;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	p->swap_cgroup = NULL;
+#endif
 	p->flags = 0;
 	if (!(swap_flags & SWAP_FLAG_PREFER))
 		++least_priority;
 	spin_unlock(&swap_lock);
 	vfree(swap_map);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	vfree(swap_cgroup);
+#endif
 	if (swap_file)
 		filp_close(swap_file, NULL);
 out:

^ permalink raw reply	[flat|nested] 50+ messages in thread

* [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  5:59 ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-05  5:59 UTC (permalink / raw)
  To: containers, linux-mm; +Cc: balbir, xemul, kamezawa.hiroyu

Hi.

Even when memory usage is limited by the cgroup memory subsystem
or memory is isolated by cpusets, swap space is still shared, so
resource isolation is incomplete. If one group uses up all the
swap space, it can affect other groups.

I have written a patch for a swap subsystem, based on the memory
subsystem, which limits swap usage per cgroup.
It can currently charge and limit swap usage.

I implemented this feature as a new subsystem rather than
as part of the memory subsystem, because I don't want to
make big changes to memcontrol.c. Even as a separate
subsystem, users can manage memory and swap in
the same cgroup directory if they mount the two together.

Basic idea of my implementation:
  - what will be charged ?
    the number of swap entries.

  - when to charge/uncharge ?
    charge at get_swap_page(), and uncharge at swap_entry_free().

  - to which group is the swap entry charged ?
    To determine which swap_cgroup (corresponding to mem_cgroup in
    the memory subsystem) the swap entry should be charged to,
    I added a pointer to the mm_struct to page_cgroup (pc->pc_mm), and
    changed the argument of get_swap_page() from (void) to
    (struct page *). As a result, get_swap_page() can determine
    which swap_cgroup it should charge the swap entry to
    by referring to page->page_cgroup->pc_mm->swap_cgroup.

  - from which group is the swap entry uncharged ?
    I added to swap_info_struct a member 'struct swap_cgroup **swap_cgroup',
    an array, indexed by swap offset, of pointers to the swap_cgroup
    each swap entry is charged to.

Todo:
  - rebase onto a newer kernel, and split into several patches.
  - Merge with memory subsystem (if it would be better), or
    remove dependency on CONFIG_CGROUP_MEM_CONT if possible
    (this requires making page_cgroup more generic).
  - More tests, cleanups, and features   :-)


Any comments or discussions would be appreciated.

Thanks,
Daisuke Nishimura


Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

---
diff -uprN linux-2.6.24-mm1/include/linux/cgroup_subsys.h linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h
--- linux-2.6.24-mm1/include/linux/cgroup_subsys.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h	2008-03-03 10:56:56.000000000 +0900
@@ -42,3 +42,9 @@ SUBSYS(mem_cgroup)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+SUBSYS(swap)
+#endif
+
+/* */
diff -uprN linux-2.6.24-mm1/include/linux/memcontrol.h linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h
--- linux-2.6.24-mm1/include/linux/memcontrol.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h	2008-03-03 10:56:56.000000000 +0900
@@ -29,6 +29,21 @@ struct page;
 struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_CONT
+/*
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ */
+struct page_cgroup {
+	struct list_head lru;		/* per cgroup LRU list */
+	struct page *page;
+	struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct mm_struct *pc_mm;
+#endif
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
+	int	 flags;
+};
 
 extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
 extern void mm_free_cgroup(struct mm_struct *mm);
diff -uprN linux-2.6.24-mm1/include/linux/mm_types.h linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h
--- linux-2.6.24-mm1/include/linux/mm_types.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h	2008-03-03 10:56:56.000000000 +0900
@@ -233,6 +233,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_CONT
 	struct mem_cgroup *mem_cgroup;
 #endif
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup *swap_cgroup;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
diff -uprN linux-2.6.24-mm1/include/linux/swap.h linux-2.6.24-mm1-swaplimit/include/linux/swap.h
--- linux-2.6.24-mm1/include/linux/swap.h	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/swap.h	2008-03-03 10:56:56.000000000 +0900
@@ -7,6 +7,7 @@
 #include <linux/list.h>
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
+#include <linux/swap_limit.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -141,6 +142,9 @@ struct swap_info_struct {
 	struct swap_extent *curr_swap_extent;
 	unsigned old_block_size;
 	unsigned short * swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup **swap_cgroup;
+#endif
 	unsigned int lowest_bit;
 	unsigned int highest_bit;
 	unsigned int cluster_next;
@@ -239,7 +243,7 @@ extern struct page *swapin_readahead(swp
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
 extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page(struct page *);
 extern swp_entry_t get_swap_page_of_type(int);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
@@ -342,7 +346,7 @@ static inline int remove_exclusive_swap_
 	return 0;
 }
 
-static inline swp_entry_t get_swap_page(void)
+static inline swp_entry_t get_swap_page(struct page *page)
 {
 	swp_entry_t entry;
 	entry.val = 0;
diff -uprN linux-2.6.24-mm1/include/linux/swap_limit.h linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h
--- linux-2.6.24-mm1/include/linux/swap_limit.h	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h	2008-03-03 10:56:56.000000000 +0900
@@ -0,0 +1,65 @@
+/*
+ * swap_limit.h
+ *
+ */
+#ifndef _LINUX_SWAP_LIMIT_H
+#define _LINUX_SWAP_LIMIT_H
+
+#include <linux/swap.h>
+#include <linux/cgroup.h>
+#include <linux/res_counter.h>
+
+struct swap_cgroup;
+struct swap_info_struct;
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+struct swap_cgroup {
+	struct cgroup_subsys_state css;
+	struct res_counter res;
+};
+
+static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, swap_subsys_id),
+				struct swap_cgroup,
+				css);
+}
+
+static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
+{
+	return container_of(task_subsys_state(p, swap_subsys_id),
+				struct swap_cgroup, css);
+}
+
+extern int swap_cgroup_charge(struct page *page,
+				struct swap_info_struct *si,
+				unsigned long offset);
+extern void swap_cgroup_uncharge(struct swap_info_struct *si,
+				unsigned long offset);
+
+#else /* CONFIG_CGROUP_SWAP_LIMIT */
+static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
+{
+	return NULL;
+}
+
+static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
+{
+	return NULL;
+}
+
+static inline int swap_cgroup_charge(struct page *page,
+					struct swap_info_struct *si,
+					unsigned long offset)
+{
+	return 0;
+}
+
+static inline void swap_cgroup_uncharge(struct swap_info_struct *si,
+					unsigned long offset)
+{
+}
+
+#endif
+
+#endif
diff -uprN linux-2.6.24-mm1/init/Kconfig linux-2.6.24-mm1-swaplimit/init/Kconfig
--- linux-2.6.24-mm1/init/Kconfig	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/init/Kconfig	2008-03-03 10:56:56.000000000 +0900
@@ -383,6 +383,12 @@ config CGROUP_MEM_CONT
 	  Provides a memory controller that manages both page cache and
 	  RSS memory.
 
+config CGROUP_SWAP_LIMIT
+	bool "cgroup subsystem for swap"
+	depends on CGROUP_MEM_CONT && SWAP
+	help
+	  Provides a swap controller that manages and limits swap usage.
+
 config PROC_PID_CPUSET
 	bool "Include legacy /proc/<pid>/cpuset file"
 	depends on CPUSETS
diff -uprN linux-2.6.24-mm1/mm/Makefile linux-2.6.24-mm1-swaplimit/mm/Makefile
--- linux-2.6.24-mm1/mm/Makefile	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/Makefile	2008-03-03 10:56:56.000000000 +0900
@@ -32,4 +32,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
+obj-$(CONFIG_CGROUP_SWAP_LIMIT) += swap_limit.o
 
diff -uprN linux-2.6.24-mm1/mm/memcontrol.c linux-2.6.24-mm1-swaplimit/mm/memcontrol.c
--- linux-2.6.24-mm1/mm/memcontrol.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/memcontrol.c	2008-03-03 10:56:56.000000000 +0900
@@ -19,6 +19,7 @@
 
 #include <linux/res_counter.h>
 #include <linux/memcontrol.h>
+#include <linux/swap_limit.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -146,18 +147,6 @@ struct mem_cgroup {
 #define PAGE_CGROUP_LOCK_BIT 	0x0
 #define PAGE_CGROUP_LOCK 		(1 << PAGE_CGROUP_LOCK_BIT)
 
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
-					/* mapped and cached states     */
-	int	 flags;
-};
 #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
 #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
 
@@ -254,15 +243,27 @@ struct mem_cgroup *mem_cgroup_from_task(
 void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
 {
 	struct mem_cgroup *mem;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup *swap;
+#endif
 
 	mem = mem_cgroup_from_task(p);
 	css_get(&mem->css);
 	mm->mem_cgroup = mem;
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	swap = swap_cgroup_from_task(p);
+	css_get(&swap->css);
+	mm->swap_cgroup = swap;
+#endif
 }
 
 void mm_free_cgroup(struct mm_struct *mm)
 {
 	css_put(&mm->mem_cgroup->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	css_put(&mm->swap_cgroup->css);
+#endif
 }
 
 static inline int page_cgroup_locked(struct page *page)
@@ -664,6 +665,10 @@ retry:
 	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
 		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	atomic_inc(&mm->mm_count);
+	pc->pc_mm = mm;
+#endif
 
 	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
 		/*
@@ -673,6 +678,9 @@ retry:
 		 */
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+		mmdrop(mm);
+#endif
 		kfree(pc);
 		if (!page)
 			goto done;
@@ -744,6 +752,9 @@ void mem_cgroup_uncharge(struct page_cgr
 		if (clear_page_cgroup(page, pc) == pc) {
 			mem = pc->mem_cgroup;
 			css_put(&mem->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+			mmdrop(pc->pc_mm);
+#endif
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			spin_lock_irqsave(&mz->lru_lock, flags);
 			__mem_cgroup_remove_list(pc);
@@ -859,6 +870,9 @@ retry:
 		atomic_set(&pc->ref_cnt, 0);
 		if (clear_page_cgroup(page, pc) == pc) {
 			css_put(&mem->css);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+			mmdrop(pc->pc_mm);
+#endif
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			__mem_cgroup_remove_list(pc);
 			kfree(pc);
diff -uprN linux-2.6.24-mm1/mm/shmem.c linux-2.6.24-mm1-swaplimit/mm/shmem.c
--- linux-2.6.24-mm1/mm/shmem.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/shmem.c	2008-03-03 10:56:56.000000000 +0900
@@ -1024,7 +1024,7 @@ static int shmem_writepage(struct page *
 	 * want to check if there's a redundant swappage to be discarded.
 	 */
 	if (wbc->for_reclaim)
-		swap = get_swap_page();
+		swap = get_swap_page(page);
 	else
 		swap.val = 0;
 
diff -uprN linux-2.6.24-mm1/mm/swap_limit.c linux-2.6.24-mm1-swaplimit/mm/swap_limit.c
--- linux-2.6.24-mm1/mm/swap_limit.c	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/swap_limit.c	2008-03-05 14:39:23.000000000 +0900
@@ -0,0 +1,194 @@
+/*
+ * swap_limit.c - SWAP controller (based on memcontrol.c)
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/rcupdate.h>
+#include <linux/cgroup.h>
+#include <linux/res_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/swap_limit.h>
+
+static struct swap_cgroup init_swap_cgroup;
+
+int swap_cgroup_charge(struct page *page,
+			struct swap_info_struct *si,
+			unsigned long offset)
+{
+	int ret;
+	struct page_cgroup *pc;
+	struct mm_struct *mm;
+	struct swap_cgroup *swap;
+
+	BUG_ON(!page);
+
+	/*
+	 * Pages to be swapped out should have been charged by memory cgroup,
+	 * but very rarely, pc would be NULL (pc is not reliable without lock,
+	 * so I should fix here).
+	 * In such cases, we charge the init_mm now.
+	 */
+	pc = page_get_page_cgroup(page);
+	if (WARN_ON(!pc))
+		mm = &init_mm;
+	else
+		mm = pc->pc_mm;
+	BUG_ON(!mm);
+
+	rcu_read_lock();
+	swap = rcu_dereference(mm->swap_cgroup);
+	rcu_read_unlock();
+	BUG_ON(!swap);
+
+	ret = res_counter_charge(&swap->res, PAGE_SIZE);
+	if (!ret) {
+		css_get(&swap->css);
+		si->swap_cgroup[offset] = swap;
+	}
+
+	return ret;
+}
+
+void swap_cgroup_uncharge(struct swap_info_struct *si, unsigned long offset)
+{
+	struct swap_cgroup *swap = si->swap_cgroup[offset];
+
+	/*
+	 * "swap" would be NULL:
+	 *  1. when get_swap_page() failed at charging swap_cgroup,
+	 *     and called swap_entry_free().
+	 *  2. when this swap entry had been assigned by
+	 *     get_swap_page_of_type() (via SWSUSP ?).
+	 */
+	if (swap) {
+		res_counter_uncharge(&swap->res, PAGE_SIZE);
+		si->swap_cgroup[offset] = NULL;
+		css_put(&swap->css);
+	}
+}
+
+static struct cgroup_subsys_state *swap_cgroup_create(struct cgroup_subsys *ss,
+						      struct cgroup *cgrp)
+{
+	struct swap_cgroup *swap;
+
+	if (unlikely((cgrp->parent) == NULL)) {
+		swap = &init_swap_cgroup;
+		init_mm.swap_cgroup = swap;
+	} else
+		swap = kzalloc(sizeof(struct swap_cgroup), GFP_KERNEL);
+
+	if (swap == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	res_counter_init(&swap->res);
+
+	return &swap->css;
+}
+
+static void swap_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	kfree(swap_cgroup_from_cgrp(cgrp));
+}
+
+static ssize_t swap_cgroup_read(struct cgroup *cgrp,
+				struct cftype *cft, struct file *file,
+				char __user *userbuf, size_t nbytes,
+				loff_t *ppos)
+{
+	return res_counter_read(&swap_cgroup_from_cgrp(cgrp)->res,
+				cft->private, userbuf, nbytes, ppos,
+				NULL);
+}
+
+static int swap_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+	*tmp = memparse(buf, &buf);
+	if (*buf != '\0')
+		return -EINVAL;
+
+	/*
+	 * Round up the value to the closest page size
+	 */
+	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
+	return 0;
+}
+
+static ssize_t swap_cgroup_write(struct cgroup *cgrp, struct cftype *cft,
+				 struct file *file, const char __user *userbuf,
+				 size_t nbytes, loff_t *ppos)
+{
+	return res_counter_write(&swap_cgroup_from_cgrp(cgrp)->res,
+				 cft->private, userbuf, nbytes, ppos,
+				 swap_cgroup_write_strategy);
+}
+
+static struct cftype swap_files[] = {
+	{
+		.name = "usage_in_bytes",
+		.private = RES_USAGE,
+		.read = swap_cgroup_read,
+	},
+	{
+		.name = "limit_in_bytes",
+		.private = RES_LIMIT,
+		.write = swap_cgroup_write,
+		.read = swap_cgroup_read,
+	},
+	{
+		.name = "failcnt",
+		.private = RES_FAILCNT,
+		.read = swap_cgroup_read,
+	},
+};
+
+static int swap_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	return cgroup_add_files(cgrp, ss, swap_files, ARRAY_SIZE(swap_files));
+}
+
+static void swap_cgroup_move_task(struct cgroup_subsys *ss,
+				  struct cgroup *cgrp,
+				  struct cgroup *old_cgrp,
+				  struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct swap_cgroup *swap, *old_swap;
+
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return;
+
+	swap = swap_cgroup_from_cgrp(cgrp);
+	old_swap = swap_cgroup_from_cgrp(old_cgrp);
+
+	if (swap == old_swap)
+		goto out;
+
+	if (p->tgid != p->pid)
+		goto out;
+
+	css_get(&swap->css);
+	rcu_assign_pointer(mm->swap_cgroup, swap);
+	css_put(&old_swap->css);
+
+out:
+	mmput(mm);
+	return;
+}
+
+struct cgroup_subsys swap_subsys = {
+	.name = "swap",
+	.create = swap_cgroup_create,
+	.destroy = swap_cgroup_destroy,
+	.populate = swap_cgroup_populate,
+	.subsys_id = swap_subsys_id,
+	.attach = swap_cgroup_move_task,
+	.early_init = 0,
+};
diff -uprN linux-2.6.24-mm1/mm/swap_state.c linux-2.6.24-mm1-swaplimit/mm/swap_state.c
--- linux-2.6.24-mm1/mm/swap_state.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/swap_state.c	2008-03-03 10:56:56.000000000 +0900
@@ -128,7 +128,7 @@ int add_to_swap(struct page * page, gfp_
 	BUG_ON(!PageUptodate(page));
 
 	for (;;) {
-		entry = get_swap_page();
+		entry = get_swap_page(page);
 		if (!entry.val)
 			return 0;
 
diff -uprN linux-2.6.24-mm1/mm/swapfile.c linux-2.6.24-mm1-swaplimit/mm/swapfile.c
--- linux-2.6.24-mm1/mm/swapfile.c	2008-02-04 14:34:24.000000000 +0900
+++ linux-2.6.24-mm1-swaplimit/mm/swapfile.c	2008-03-03 10:56:56.000000000 +0900
@@ -28,6 +28,7 @@
 #include <linux/capability.h>
 #include <linux/syscalls.h>
 #include <linux/memcontrol.h>
+#include <linux/swap_limit.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -172,7 +173,10 @@ no_page:
 	return 0;
 }
 
-swp_entry_t get_swap_page(void)
+/* get_swap_page() calls this */
+static int swap_entry_free(struct swap_info_struct *, unsigned long);
+
+swp_entry_t get_swap_page(struct page *page)
 {
 	struct swap_info_struct *si;
 	pgoff_t offset;
@@ -201,6 +205,16 @@ swp_entry_t get_swap_page(void)
 		swap_list.next = next;
 		offset = scan_swap_map(si);
 		if (offset) {
+			/*
+			 * This should be the first use of this swap entry,
+			 * so charge this swap entry now.
+			 */
+			if (swap_cgroup_charge(page, si, offset)) {
+				/* should free this entry */
+				swap_entry_free(si, offset);
+
+				goto noswap;
+			}
 			spin_unlock(&swap_lock);
 			return swp_entry(type, offset);
 		}
@@ -285,6 +299,7 @@ static int swap_entry_free(struct swap_i
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			swap_cgroup_uncharge(p, offset);
 		}
 	}
 	return count;
@@ -1207,6 +1222,9 @@ asmlinkage long sys_swapoff(const char _
 {
 	struct swap_info_struct * p = NULL;
 	unsigned short *swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup **swap_cgroup;
+#endif
 	struct file *swap_file, *victim;
 	struct address_space *mapping;
 	struct inode *inode;
@@ -1309,10 +1327,17 @@ asmlinkage long sys_swapoff(const char _
 	p->max = 0;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	swap_cgroup = p->swap_cgroup;
+	p->swap_cgroup = NULL;
+#endif
 	p->flags = 0;
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	vfree(swap_cgroup);
+#endif
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1460,6 +1485,9 @@ asmlinkage long sys_swapon(const char __
 	unsigned long maxpages = 1;
 	int swapfilesize;
 	unsigned short *swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	struct swap_cgroup **swap_cgroup;
+#endif
 	struct page *page = NULL;
 	struct inode *inode = NULL;
 	int did_down = 0;
@@ -1483,6 +1511,9 @@ asmlinkage long sys_swapon(const char __
 	p->swap_file = NULL;
 	p->old_block_size = 0;
 	p->swap_map = NULL;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	p->swap_cgroup = NULL;
+#endif
 	p->lowest_bit = 0;
 	p->highest_bit = 0;
 	p->cluster_nr = 0;
@@ -1647,6 +1678,15 @@ asmlinkage long sys_swapon(const char __
 				1 /* header page */;
 		if (error)
 			goto bad_swap;
+
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+		p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
+		if (!(p->swap_cgroup)) {
+			error = -ENOMEM;
+			goto bad_swap;
+		}
+		memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
+#endif
 	}
 
 	if (nr_good_pages) {
@@ -1704,13 +1744,22 @@ bad_swap:
 bad_swap_2:
 	spin_lock(&swap_lock);
 	swap_map = p->swap_map;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	swap_cgroup = p->swap_cgroup;
+#endif
 	p->swap_file = NULL;
 	p->swap_map = NULL;
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	p->swap_cgroup = NULL;
+#endif
 	p->flags = 0;
 	if (!(swap_flags & SWAP_FLAG_PREFER))
 		++least_priority;
 	spin_unlock(&swap_lock);
 	vfree(swap_map);
+#ifdef CONFIG_CGROUP_SWAP_LIMIT
+	vfree(swap_cgroup);
+#endif
 	if (swap_file)
 		filp_close(swap_file, NULL);
 out:


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CE36A9.3060204-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  5:59 ` Daisuke Nishimura
@ 2008-03-05  6:36     ` Paul Menage
  -1 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-05  6:36 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, xemul-GEFAQzZX7r8dnm+yROfE0A,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Hi Daisuke,

Most of my comments below are to do with style issues with cgroups,
rather than the details of the memory management code.

2008/3/4 Daisuke Nishimura <nishimura-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>:
>  +/*
>  + * A page_cgroup page is associated with every page descriptor. The
>  + * page_cgroup helps us identify information about the cgroup
>  + */
>  +struct page_cgroup {
>  +       struct list_head lru;           /* per cgroup LRU list */
>  +       struct page *page;
>  +       struct mem_cgroup *mem_cgroup;
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +       struct mm_struct *pc_mm;
>  +#endif
>  +       atomic_t ref_cnt;               /* Helpful when pages move b/w  */
>  +                                       /* mapped and cached states     */
>  +       int      flags;
>  +};
>
>  +
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +struct swap_cgroup {
>  +       struct cgroup_subsys_state css;
>  +       struct res_counter res;
>  +};
>  +
>  +static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
>  +{
>  +       return container_of(cgroup_subsys_state(cgrp, swap_subsys_id),
>  +                               struct swap_cgroup,
>  +                               css);
>  +}
>  +
>  +static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
>  +{
>  +       return container_of(task_subsys_state(p, swap_subsys_id),
>  +                               struct swap_cgroup, css);
>  +}

Can't these definitions be moved into swap_limit.c?

>  @@ -254,15 +243,27 @@ struct mem_cgroup *mem_cgroup_from_task(
>   void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
>   {
>         struct mem_cgroup *mem;
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +       struct swap_cgroup *swap;
>  +#endif
>
>         mem = mem_cgroup_from_task(p);
>         css_get(&mem->css);
>         mm->mem_cgroup = mem;
>  +
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +       swap = swap_cgroup_from_task(p);
>  +       css_get(&swap->css);
>  +       mm->swap_cgroup = swap;
>  +#endif

My feeling is that it would be cleaner to move this code into
swap_limit.c, and have a separate mm_init_swap_cgroup() function. (And
a mm_free_swap_cgroup() function).

>  +       pc = page_get_page_cgroup(page);
>  +       if (WARN_ON(!pc))
>  +               mm = &init_mm;
>  +       else
>  +               mm = pc->pc_mm;
>  +       BUG_ON(!mm);

Is this safe against races with the mem.force_empty operation?

>  +
>  +       rcu_read_lock();
>  +       swap = rcu_dereference(mm->swap_cgroup);
>  +       rcu_read_unlock();
>  +       BUG_ON(!swap);

Is it safe to do rcu_read_unlock() while you are still planning to
operate on the value of "swap"?

>  +
>  +static ssize_t swap_cgroup_read(struct cgroup *cgrp,
>  +                               struct cftype *cft, struct file *file,
>  +                               char __user *userbuf, size_t nbytes,
>  +                               loff_t *ppos)
>  +{
>  +       return res_counter_read(&swap_cgroup_from_cgrp(cgrp)->res,
>  +                               cft->private, userbuf, nbytes, ppos,
>  +                               NULL);
>  +}

Can you use the cgroups read_u64 method, and just call res_counter_read_u64?

>  +
>  +static int swap_cgroup_write_strategy(char *buf, unsigned long long *tmp)
>  +{
>  +       *tmp = memparse(buf, &buf);
>  +       if (*buf != '\0')
>  +               return -EINVAL;
>  +
>  +       /*
>  +        * Round up the value to the closest page size
>  +        */
>  +       *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
>  +       return 0;
>  +}

This is the same as mem_cgroup_write_strategy. As part of your patch,
can you create a res_counter_write_pagealign() strategy function in
res_counter.c and use it from the memory and swap cgroups?

>  +
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +               p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
>  +               if (!(p->swap_cgroup)) {
>  +                       error = -ENOMEM;
>  +                       goto bad_swap;
>  +               }
>  +               memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
>  +#endif

It would be nice to only allocate these the first time the swap cgroup
subsystem becomes active, to avoid the overhead for people not using
it; even better if you can free it again if the swap subsystem becomes
inactive again.

Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  6:36     ` Paul Menage
  0 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-05  6:36 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: containers, linux-mm, balbir, xemul, kamezawa.hiroyu

Hi Daisuke,

Most of my comments below are to do with style issues with cgroups,
rather than the details of the memory management code.

2008/3/4 Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>:
>  +/*
>  + * A page_cgroup page is associated with every page descriptor. The
>  + * page_cgroup helps us identify information about the cgroup
>  + */
>  +struct page_cgroup {
>  +       struct list_head lru;           /* per cgroup LRU list */
>  +       struct page *page;
>  +       struct mem_cgroup *mem_cgroup;
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +       struct mm_struct *pc_mm;
>  +#endif
>  +       atomic_t ref_cnt;               /* Helpful when pages move b/w  */
>  +                                       /* mapped and cached states     */
>  +       int      flags;
>  +};
>
>  +
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +struct swap_cgroup {
>  +       struct cgroup_subsys_state css;
>  +       struct res_counter res;
>  +};
>  +
>  +static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
>  +{
>  +       return container_of(cgroup_subsys_state(cgrp, swap_subsys_id),
>  +                               struct swap_cgroup,
>  +                               css);
>  +}
>  +
>  +static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
>  +{
>  +       return container_of(task_subsys_state(p, swap_subsys_id),
>  +                               struct swap_cgroup, css);
>  +}

Can't these definitions be moved into swap_limit.c?

>  @@ -254,15 +243,27 @@ struct mem_cgroup *mem_cgroup_from_task(
>   void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
>   {
>         struct mem_cgroup *mem;
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +       struct swap_cgroup *swap;
>  +#endif
>
>         mem = mem_cgroup_from_task(p);
>         css_get(&mem->css);
>         mm->mem_cgroup = mem;
>  +
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +       swap = swap_cgroup_from_task(p);
>  +       css_get(&swap->css);
>  +       mm->swap_cgroup = swap;
>  +#endif

My feeling is that it would be cleaner to move this code into
swap_limit.c, and have a separate mm_init_swap_cgroup() function. (And
a mm_free_swap_cgroup() function).

>  +       pc = page_get_page_cgroup(page);
>  +       if (WARN_ON(!pc))
>  +               mm = &init_mm;
>  +       else
>  +               mm = pc->pc_mm;
>  +       BUG_ON(!mm);

Is this safe against races with the mem.force_empty operation?

>  +
>  +       rcu_read_lock();
>  +       swap = rcu_dereference(mm->swap_cgroup);
>  +       rcu_read_unlock();
>  +       BUG_ON(!swap);

Is it safe to do rcu_read_unlock() while you are still planning to
operate on the value of "swap"?

>  +
>  +static ssize_t swap_cgroup_read(struct cgroup *cgrp,
>  +                               struct cftype *cft, struct file *file,
>  +                               char __user *userbuf, size_t nbytes,
>  +                               loff_t *ppos)
>  +{
>  +       return res_counter_read(&swap_cgroup_from_cgrp(cgrp)->res,
>  +                               cft->private, userbuf, nbytes, ppos,
>  +                               NULL);
>  +}

Can you use the cgroups read_u64 method, and just call res_counter_read_u64?

>  +
>  +static int swap_cgroup_write_strategy(char *buf, unsigned long long *tmp)
>  +{
>  +       *tmp = memparse(buf, &buf);
>  +       if (*buf != '\0')
>  +               return -EINVAL;
>  +
>  +       /*
>  +        * Round up the value to the closest page size
>  +        */
>  +       *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
>  +       return 0;
>  +}

This is the same as mem_cgroup_write_strategy. As part of your patch,
can you create a res_counter_write_pagealign() strategy function in
res_counter.c and use it from the memory and swap cgroups?

>  +
>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>  +               p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
>  +               if (!(p->swap_cgroup)) {
>  +                       error = -ENOMEM;
>  +                       goto bad_swap;
>  +               }
>  +               memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
>  +#endif

It would be nice to only allocate these the first time the swap cgroup
subsystem becomes active, to avoid the overhead for people not using
it; even better if you can free it again if the swap subsystem becomes
inactive again.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <6599ad830803042236x3e5fdf0dmaf4119997025ba40-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  6:36     ` Paul Menage
@ 2008-03-06 12:20         ` Daisuke Nishimura
  -1 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-06 12:20 UTC (permalink / raw)
  To: Paul Menage
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	containers-qjLDD68F18O7TbgM5vRIOg, hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	xemul-GEFAQzZX7r8dnm+yROfE0A

Hi.

Paul Menage wrote:
>>  +       pc = page_get_page_cgroup(page);
>>  +       if (WARN_ON(!pc))
>>  +               mm = &init_mm;
>>  +       else
>>  +               mm = pc->pc_mm;
>>  +       BUG_ON(!mm);
> 
> Is this safe against races with the mem.force_empty operation?
> 
I have not yet considered the force_empty operation
of the memory subsystem.
Thank you for pointing it out.

>>  +
>>  +       rcu_read_lock();
>>  +       swap = rcu_dereference(mm->swap_cgroup);
>>  +       rcu_read_unlock();
>>  +       BUG_ON(!swap);
> 
> Is it safe to do rcu_read_unlock() while you are still planning to
> operate on the value of "swap"?
> 
You are right.
I think I should call css_get() before rcu_read_unlock(), as
the memory subsystem does.

>>  +
>>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>>  +               p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
>>  +               if (!(p->swap_cgroup)) {
>>  +                       error = -ENOMEM;
>>  +                       goto bad_swap;
>>  +               }
>>  +               memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
>>  +#endif
> 
> It would be nice to only allocate these the first time the swap cgroup
> subsystem becomes active, to avoid the overhead for people not using
> it; even better if you can free it again if the swap subsystem becomes
> inactive again.
> 
Hmm.. good idea.
I think this is possible by adding a flag file, like "swap.enable_limit",
to the top-level cgroup directory, and charging to the top cgroup all the
swap entries that are in use at the time the flag is enabled.



Thanks,
Daisuke Nishimura.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06 12:20         ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-06 12:20 UTC (permalink / raw)
  To: Paul Menage; +Cc: containers, linux-mm, balbir, xemul, kamezawa.hiroyu, hugh

Hi.

Paul Menage wrote:
>>  +       pc = page_get_page_cgroup(page);
>>  +       if (WARN_ON(!pc))
>>  +               mm = &init_mm;
>>  +       else
>>  +               mm = pc->pc_mm;
>>  +       BUG_ON(!mm);
> 
> Is this safe against races with the mem.force_empty operation?
> 
I have not yet considered the force_empty operation
of the memory subsystem.
Thank you for pointing it out.

>>  +
>>  +       rcu_read_lock();
>>  +       swap = rcu_dereference(mm->swap_cgroup);
>>  +       rcu_read_unlock();
>>  +       BUG_ON(!swap);
> 
> Is it safe to do rcu_read_unlock() while you are still planning to
> operate on the value of "swap"?
> 
You are right.
I think I should call css_get() before rcu_read_unlock(), as
the memory subsystem does.

>>  +
>>  +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>>  +               p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
>>  +               if (!(p->swap_cgroup)) {
>>  +                       error = -ENOMEM;
>>  +                       goto bad_swap;
>>  +               }
>>  +               memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
>>  +#endif
> 
> It would be nice to only allocate these the first time the swap cgroup
> subsystem becomes active, to avoid the overhead for people not using
> it; even better if you can free it again if the swap subsystem becomes
> inactive again.
> 
Hmm.. good idea.
I think this is possible by adding a flag file, like "swap.enable_limit",
to the top-level cgroup directory, and charging to the top cgroup all the
swap entries that are in use at the time the flag is enabled.



Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  5:59 ` Daisuke Nishimura
@ 2008-03-05  6:53     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05  6:53 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, xemul-GEFAQzZX7r8dnm+yROfE0A,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

On Wed, 05 Mar 2008 14:59:05 +0900
Daisuke Nishimura <nishimura-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org> wrote:

>  #ifdef CONFIG_CGROUP_MEM_CONT
> +/*
> + * A page_cgroup page is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup
> + */
> +struct page_cgroup {
> +	struct list_head lru;		/* per cgroup LRU list */
> +	struct page *page;
> +	struct mem_cgroup *mem_cgroup;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct mm_struct *pc_mm;
> +#endif
> +	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> +					/* mapped and cached states     */
> +	int	 flags;
> +};
>  
As a first impression, I don't like increasing the size of this... but I have no
alternative idea.



>  static inline int page_cgroup_locked(struct page *page)
> @@ -664,6 +665,10 @@ retry:
>  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
>  		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	atomic_inc(&mm->mm_count);
> +	pc->pc_mm = mm;
> +#endif
>  
Strong NACK to this atomic_inc().
What happens when tmpfs pages go to swap?


>  	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
>  		/*
> @@ -673,6 +678,9 @@ retry:

> +int swap_cgroup_charge(struct page *page,
> +			struct swap_info_struct *si,
> +			unsigned long offset)
> +{
> +	int ret;
> +	struct page_cgroup *pc;
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap;
> +
> +	BUG_ON(!page);
> +
> +	/*
> +	 * Pages to be swapped out should have been charged by memory cgroup,
> +	 * but very rarely, pc would be NULL (pc is not reliable without lock,
> +	 * so I should fix here).
> +	 * In such cases, we charge the init_mm now.
> +	 */
> +	pc = page_get_page_cgroup(page);
> +	if (WARN_ON(!pc))
> +		mm = &init_mm;
> +	else
> +		mm = pc->pc_mm;
> +	BUG_ON(!mm);
> +
> +	rcu_read_lock();
> +	swap = rcu_dereference(mm->swap_cgroup);
> +	rcu_read_unlock();
> +	BUG_ON(!swap);
Is there no race ?

At first glance, remembering the mm_struct is not a very good idea.
Remembering the swap controller itself would be better.
If you go in this direction, how about doing it this way?

==
enum {
#ifdef CONFIG_CGROUP_MEM_CONT
	MEMORY_RESOURCE_CONTROLLER,
#endif
#ifdef CONFIG_CGROUP_SWAP_CONT
	SWAP_CONTROLLER,
#endif
	NR_PAGE_CONTROLLER,
}

struct page_cgroup {
	......
	void*	controlls[NR_PAGE_CONTROLLER];
	....
};
==

Thanks,
-Kame

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  6:53     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05  6:53 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: containers, linux-mm, balbir, xemul

On Wed, 05 Mar 2008 14:59:05 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

>  #ifdef CONFIG_CGROUP_MEM_CONT
> +/*
> + * A page_cgroup page is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup
> + */
> +struct page_cgroup {
> +	struct list_head lru;		/* per cgroup LRU list */
> +	struct page *page;
> +	struct mem_cgroup *mem_cgroup;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct mm_struct *pc_mm;
> +#endif
> +	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> +					/* mapped and cached states     */
> +	int	 flags;
> +};
>  
As a first impression, I don't like increasing the size of this... but I have no
alternative idea.



>  static inline int page_cgroup_locked(struct page *page)
> @@ -664,6 +665,10 @@ retry:
>  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
>  		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	atomic_inc(&mm->mm_count);
> +	pc->pc_mm = mm;
> +#endif
>  
Strong NACK to this atomic_inc().
What happens when tmpfs pages go to swap?


>  	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
>  		/*
> @@ -673,6 +678,9 @@ retry:

> +int swap_cgroup_charge(struct page *page,
> +			struct swap_info_struct *si,
> +			unsigned long offset)
> +{
> +	int ret;
> +	struct page_cgroup *pc;
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap;
> +
> +	BUG_ON(!page);
> +
> +	/*
> +	 * Pages to be swapped out should have been charged by memory cgroup,
> +	 * but very rarely, pc would be NULL (pc is not reliable without lock,
> +	 * so I should fix here).
> +	 * In such cases, we charge the init_mm now.
> +	 */
> +	pc = page_get_page_cgroup(page);
> +	if (WARN_ON(!pc))
> +		mm = &init_mm;
> +	else
> +		mm = pc->pc_mm;
> +	BUG_ON(!mm);
> +
> +	rcu_read_lock();
> +	swap = rcu_dereference(mm->swap_cgroup);
> +	rcu_read_unlock();
> +	BUG_ON(!swap);
Is there no race ?

At first glance, remembering the mm_struct is not a very good idea.
Remembering the swap controller itself would be better.
If you go in this direction, how about doing it this way?

==
enum {
#ifdef CONFIG_CGROUP_MEM_CONT
	MEMORY_RESOURCE_CONTROLLER,
#endif
#ifdef CONFIG_CGROUP_SWAP_CONT
	SWAP_CONTROLLER,
#endif
	NR_PAGE_CONTROLLER,
}

struct page_cgroup {
	......
	void*	controlls[NR_PAGE_CONTROLLER];
	....
};
==

Thanks,
-Kame





--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <20080305155329.60e02f48.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  6:53     ` KAMEZAWA Hiroyuki
@ 2008-03-05 21:51         ` Hirokazu Takahashi
  -1 siblings, 0 replies; 50+ messages in thread
From: Hirokazu Takahashi @ 2008-03-05 21:51 UTC (permalink / raw)
  To: nishimura-YQH0OdQVrdy45+QrQBaojngSJqDPrsil
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	xemul-GEFAQzZX7r8dnm+yROfE0A

Hi,

> >  #ifdef CONFIG_CGROUP_MEM_CONT
> > +/*
> > + * A page_cgroup page is associated with every page descriptor. The
> > + * page_cgroup helps us identify information about the cgroup
> > + */
> > +struct page_cgroup {
> > +	struct list_head lru;		/* per cgroup LRU list */
> > +	struct page *page;
> > +	struct mem_cgroup *mem_cgroup;
> > +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> > +	struct mm_struct *pc_mm;
> > +#endif
> > +	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> > +					/* mapped and cached states     */
> > +	int	 flags;
> > +};
> >  
> As first impression, I don't like to increase size of this...but have no alternative
> idea.

If you really want the swap space subsystem and the memory subsystem
to work independently of each other, you could introduce a new data
structure that binds pages in the swapcache to their swap_cgroup.
That would be enough, since only a small fraction of pages are in the swapcache.

Thanks,
Hirokazu Takahashi.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05 21:51         ` Hirokazu Takahashi
  0 siblings, 0 replies; 50+ messages in thread
From: Hirokazu Takahashi @ 2008-03-05 21:51 UTC (permalink / raw)
  To: nishimura; +Cc: kamezawa.hiroyu, containers, linux-mm, xemul, balbir

Hi,

> >  #ifdef CONFIG_CGROUP_MEM_CONT
> > +/*
> > + * A page_cgroup page is associated with every page descriptor. The
> > + * page_cgroup helps us identify information about the cgroup
> > + */
> > +struct page_cgroup {
> > +	struct list_head lru;		/* per cgroup LRU list */
> > +	struct page *page;
> > +	struct mem_cgroup *mem_cgroup;
> > +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> > +	struct mm_struct *pc_mm;
> > +#endif
> > +	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> > +					/* mapped and cached states     */
> > +	int	 flags;
> > +};
> >  
> As first impression, I don't like to increase size of this...but have no alternative
> idea.

If you really want the swap space subsystem and the memory subsystem
to work independently of each other, you could introduce a new data
structure that binds pages in the swapcache to their swap_cgroup.
That would be enough, since only a small fraction of pages are in the swapcache.

Thanks,
Hirokazu Takahashi.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  6:53     ` KAMEZAWA Hiroyuki
@ 2008-03-06 11:45         ` Daisuke Nishimura
  -1 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-06 11:45 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, xemul-GEFAQzZX7r8dnm+yROfE0A,
	hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Hi.

> At first look, remembering mm struct is not very good.
> Remembering swap controller itself is better.

The swap_cgroup at the time the page (and page_cgroup) is allocated and
the swap_cgroup at the time the page is going to be swapped out may
differ because of swap_cgroup_move_task(), so I think the swap_cgroup
to be charged should be determined at the point of swapout.

Instead of pointing to the mm_struct from page_cgroup, it would be
better to determine, via rmap, the mm_struct that the page being swapped
out belongs to, and charge the swap_cgroup of that mm_struct.
With this implementation, I don't need to add a new member to page_cgroup.

What do you think ?

Thanks,
Daisuke Nishimura.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06 11:45         ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-06 11:45 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: containers, linux-mm, balbir, xemul, hugh

Hi.

> At first look, remembering mm struct is not very good.
> Remembering swap controller itself is better.

The swap_cgroup at the time the page (and page_cgroup) is allocated and
the swap_cgroup at the time the page is going to be swapped out may
differ because of swap_cgroup_move_task(), so I think the swap_cgroup
to be charged should be determined at the point of swapout.

Instead of pointing to the mm_struct from page_cgroup, it would be
better to determine, via rmap, the mm_struct that the page being swapped
out belongs to, and charge the swap_cgroup of that mm_struct.
With this implementation, I don't need to add a new member to page_cgroup.

What do you think ?

Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CFD957.3060402-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06 11:45         ` Daisuke Nishimura
@ 2008-03-06 12:25             ` Pavel Emelyanov
  -1 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06 12:25 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Daisuke Nishimura wrote:
> Hi.
> 
>> At first look, remembering mm struct is not very good.
>> Remembering swap controller itself is better.
> 
> The swap_cgroup when the page(and page_cgroup) is allocated and
> the swap_cgroup when the page is going to be swapped out may be
> different by swap_cgroup_move_task(), so I think swap_cgroup
> to be charged should be determined at the point of swapout.

No. Since we currently do not account for the situation where pages are
shared between cgroups, we may assume that the cgroup by which the
page was allocated and the cgroup in which this page goes to swap
are the same.

> Instead of pointing mm_struct from page_cgroup, it would be
> better to determine the mm_struct which the page to be swapped
> out is belongs to by rmap, and charge swap_cgroup of the mm_struct.
> In this implementation, I don't need to add new member to page_cgroup.
> 
> What do you think ?
> 
> 
> Thanks,
> Daisuke Nishimura.
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06 12:25             ` Pavel Emelyanov
  0 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06 12:25 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: KAMEZAWA Hiroyuki, containers, linux-mm, balbir, hugh

Daisuke Nishimura wrote:
> Hi.
> 
>> At first look, remembering mm struct is not very good.
>> Remembering swap controller itself is better.
> 
> The swap_cgroup when the page(and page_cgroup) is allocated and
> the swap_cgroup when the page is going to be swapped out may be
> different by swap_cgroup_move_task(), so I think swap_cgroup
> to be charged should be determined at the point of swapout.

No. Since we currently do not account for the situation where pages are
shared between cgroups, we may assume that the cgroup by which the
page was allocated and the cgroup in which this page goes to swap
are the same.

> Instead of pointing mm_struct from page_cgroup, it would be
> better to determine the mm_struct which the page to be swapped
> out is belongs to by rmap, and charge swap_cgroup of the mm_struct.
> In this implementation, I don't need to add new member to page_cgroup.
> 
> What do you think ?
> 
> 
> Thanks,
> Daisuke Nishimura.
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06 11:45         ` Daisuke Nishimura
@ 2008-03-06 12:56             ` kamezawa.hiroyu
  -1 siblings, 0 replies; 50+ messages in thread
From: kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A @ 2008-03-06 12:56 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	containers-qjLDD68F18O7TbgM5vRIOg, hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	xemul-GEFAQzZX7r8dnm+yROfE0A

>> At first look, remembering mm struct is not very good.
>> Remembering swap controller itself is better.
>
>The swap_cgroup when the page(and page_cgroup) is allocated and
>the swap_cgroup when the page is going to be swapped out may be
>different by swap_cgroup_move_task(), so I think swap_cgroup
>to be charged should be determined at the point of swapout.
>
Accounting swap against the entity which allocates anon memory is
not strange. The problem here is move_task itself.
Currently, charges against anon memory are not moved when a task which
uses it is moved. Please fix this behavior first if you think this is
problematic.

But, ultimately, a daemon driven by the process event connector
determines the group before the process starts using anon memory. It's
doubtful that it's worth adding complicated/costly logic for this.


Thanks,
-Kame

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06 12:56             ` kamezawa.hiroyu
  0 siblings, 0 replies; 50+ messages in thread
From: kamezawa.hiroyu @ 2008-03-06 12:56 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: KAMEZAWA Hiroyuki, containers, linux-mm, balbir, xemul, hugh

>> At first look, remembering mm struct is not very good.
>> Remembering swap controller itself is better.
>
>The swap_cgroup when the page(and page_cgroup) is allocated and
>the swap_cgroup when the page is going to be swapped out may be
>different by swap_cgroup_move_task(), so I think swap_cgroup
>to be charged should be determined at the point of swapout.
>
Accounting swap against the entity which allocates anon memory is
not strange. The problem here is move_task itself.
Currently, charges against anon memory are not moved when a task which
uses it is moved. Please fix this behavior first if you think this is
problematic.

But, ultimately, a daemon driven by the process event connector
determines the group before the process starts using anon memory. It's
doubtful that it's worth adding complicated/costly logic for this.


Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <6197904.1204808216900.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06 12:56             ` kamezawa.hiroyu
@ 2008-03-07  8:22                 ` Daisuke Nishimura
  -1 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-07  8:22 UTC (permalink / raw)
  To: kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, xemul-GEFAQzZX7r8dnm+yROfE0A,
	hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Hi.

kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org wrote:
>>> At first look, remembering mm struct is not very good.
>>> Remembering swap controller itself is better.
>> The swap_cgroup when the page(and page_cgroup) is allocated and
>> the swap_cgroup when the page is going to be swapped out may be
>> different by swap_cgroup_move_task(), so I think swap_cgroup
>> to be charged should be determined at the point of swapout.
>>
> Accounting swap against an entity which allocs anon memory is
> not strange. Problem here is move_task itself.
> Now, charges against anon is not moved when a task which uses it
> is moved. please fix this behavior first if you think this is
> problematic.
> 
> But, finally, a daemon driven by process event connector
> determines the group before process starts using anon. It's
> doubtful that it's worth to add complicated/costly ones.
> 

I agree with you.

I think the current behavior of move_task is problematic
and should be fixed.
But fixing it would be difficult and would add a costly process,
so I need to consider it further.


Thanks,
Daisuke Nishimura.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-07  8:22                 ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-07  8:22 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: containers, linux-mm, balbir, xemul, hugh

Hi.

kamezawa.hiroyu@jp.fujitsu.com wrote:
>>> At first look, remembering mm struct is not very good.
>>> Remembering swap controller itself is better.
>> The swap_cgroup when the page(and page_cgroup) is allocated and
>> the swap_cgroup when the page is going to be swapped out may be
>> different by swap_cgroup_move_task(), so I think swap_cgroup
>> to be charged should be determined at the point of swapout.
>>
> Accounting swap against an entity which allocs anon memory is
> not strange. Problem here is move_task itself.
> Now, charges against anon is not moved when a task which uses it
> is moved. please fix this behavior first if you think this is
> problematic.
> 
> But, finally, a daemon driven by process event connector
> determines the group before process starts using anon. It's
> doubtful that it's worth to add complicated/costly ones.
> 

I agree with you.

I think the current behavior of move_task is problematic
and should be fixed.
But fixing it would be difficult and would add a costly process,
so I need to consider it further.


Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06 12:56             ` kamezawa.hiroyu
@ 2008-03-12 22:57                 ` YAMAMOTO Takashi
  -1 siblings, 0 replies; 50+ messages in thread
From: YAMAMOTO Takashi @ 2008-03-12 22:57 UTC (permalink / raw)
  To: kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	containers-qjLDD68F18O7TbgM5vRIOg, hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	xemul-GEFAQzZX7r8dnm+yROfE0A,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

> >> At first look, remembering mm struct is not very good.
> >> Remembering swap controller itself is better.
> >
> >The swap_cgroup when the page(and page_cgroup) is allocated and
> >the swap_cgroup when the page is going to be swapped out may be
> >different by swap_cgroup_move_task(), so I think swap_cgroup
> >to be charged should be determined at the point of swapout.
> >
> Accounting swap against an entity which allocs anon memory is
> not strange. Problem here is move_task itself.
> Now, charges against anon is not moved when a task which uses it
> is moved. please fix this behavior first if you think this is
> problematic.
> 
> But, finally, a daemon driven by process event connector
> determines the group before process starts using anon. It's
> doubtful that it's worth to add complicated/costly ones.
> 
> 
> Thanks,
> -Kame

Doesn't PEC work asynchronously, allowing processes to use
anonymous memory before they are moved by the daemon?

YAMAMOTO Takashi

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-12 22:57                 ` YAMAMOTO Takashi
  0 siblings, 0 replies; 50+ messages in thread
From: YAMAMOTO Takashi @ 2008-03-12 22:57 UTC (permalink / raw)
  To: kamezawa.hiroyu; +Cc: nishimura, linux-mm, containers, hugh, balbir, xemul

> >> At first look, remembering mm struct is not very good.
> >> Remembering swap controller itself is better.
> >
> >The swap_cgroup when the page(and page_cgroup) is allocated and
> >the swap_cgroup when the page is going to be swapped out may be
> >different by swap_cgroup_move_task(), so I think swap_cgroup
> >to be charged should be determined at the point of swapout.
> >
> Accounting swap against an entity which allocs anon memory is
> not strange. Problem here is move_task itself.
> Now, charges against anon is not moved when a task which uses it
> is moved. please fix this behavior first if you think this is
> problematic.
> 
> But, finally, a daemon driven by process event connector
> determines the group before process starts using anon. It's
> doubtful that it's worth to add complicated/costly ones.
> 
> 
> Thanks,
> -Kame

Doesn't PEC work asynchronously, allowing processes to use
anonymous memory before they are moved by the daemon?

YAMAMOTO Takashi

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  5:59 ` Daisuke Nishimura
@ 2008-03-05  7:03     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05  7:03 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, xemul-GEFAQzZX7r8dnm+yROfE0A,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

On Wed, 05 Mar 2008 14:59:05 +0900
Daisuke Nishimura <nishimura-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org> wrote:

> +int swap_cgroup_charge(struct page *page,
> +			struct swap_info_struct *si,
> +			unsigned long offset)
> +{
> +	int ret;
> +	struct page_cgroup *pc;
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap;
> +
> +	BUG_ON(!page);
> +
> +	/*
> +	 * Pages to be swapped out should have been charged by memory cgroup,
> +	 * but very rarely, pc would be NULL (pc is not reliable without lock,
> +	 * so I should fix here).
> +	 * In such cases, we charge the init_mm now.
> +	 */
> +	pc = page_get_page_cgroup(page);
> +	if (WARN_ON(!pc))
> +		mm = &init_mm;
> +	else
> +		mm = pc->pc_mm;
> +	BUG_ON(!mm);
> +
> +	rcu_read_lock();
> +	swap = rcu_dereference(mm->swap_cgroup);
> +	rcu_read_unlock();
> +	BUG_ON(!swap);
> +
> +	ret = res_counter_charge(&swap->res, PAGE_SIZE);
> +	if (!ret) {
> +		css_get(&swap->css);
> +		si->swap_cgroup[offset] = swap;
> +	}
> +
I think it would be better to reclaim swap entries that are used for
SwapCache but are not on disk before failing the charge.

Thanks,
-Kame

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  7:03     ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-05  7:03 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: containers, linux-mm, balbir, xemul

On Wed, 05 Mar 2008 14:59:05 +0900
Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> wrote:

> +int swap_cgroup_charge(struct page *page,
> +			struct swap_info_struct *si,
> +			unsigned long offset)
> +{
> +	int ret;
> +	struct page_cgroup *pc;
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap;
> +
> +	BUG_ON(!page);
> +
> +	/*
> +	 * Pages to be swapped out should have been charged by memory cgroup,
> +	 * but very rarely, pc would be NULL (pc is not reliable without lock,
> +	 * so I should fix here).
> +	 * In such cases, we charge the init_mm now.
> +	 */
> +	pc = page_get_page_cgroup(page);
> +	if (WARN_ON(!pc))
> +		mm = &init_mm;
> +	else
> +		mm = pc->pc_mm;
> +	BUG_ON(!mm);
> +
> +	rcu_read_lock();
> +	swap = rcu_dereference(mm->swap_cgroup);
> +	rcu_read_unlock();
> +	BUG_ON(!swap);
> +
> +	ret = res_counter_charge(&swap->res, PAGE_SIZE);
> +	if (!ret) {
> +		css_get(&swap->css);
> +		si->swap_cgroup[offset] = swap;
> +	}
> +
I think it would be better to reclaim swap entries that are used for
SwapCache but are not on disk before failing the charge.

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  5:59 ` Daisuke Nishimura
@ 2008-03-05  7:28     ` Balbir Singh
  -1 siblings, 0 replies; 50+ messages in thread
From: Balbir Singh @ 2008-03-05  7:28 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Hugh Dickins,
	xemul-GEFAQzZX7r8dnm+yROfE0A

Daisuke Nishimura wrote:
> Hi.
> 
> Even if limiting memory usage by cgroup memory subsystem
> or isolating memory by cpuset, swap space is shared, so
> resource isolation is not enough. If one group uses up all the
> swap space, it can affect other groups.
> 

Yes, that is true. Please ensure that you also cc Hugh Dickins for all swap
related changes.

> I try making a patch of swap subsystem based on memory
> subsystem, which limits swap usage per cgroup.
> It can now charge and limit the swap usage.
> 
> I implemented this feature as a new subsystem,
> not as a part of memory subsystem, because I don't want to
> make big change to memcontrol.c, and even if implemented
> as other subsystem, users can manage memory and swap on
> the same cgroup directory if mount them together.
> 

I agree, the swap system should be independent of the memory resource controller.

> Basic idea of my implementation:
>   - what will be charged ?
>     the number of swap entries.
> 
>   - when to charge/uncharge ?
>     charge at get_swap_entry(), and uncharge at swap_entry_free().
> 

You mean get_swap_page(), I suppose. The assumption in the code is that every
swap page being charged has already been charged by the memory controller (that
will go against making the controllers independent). Also, be careful of any
charge operations under a spin_lock(). We tried controlling pages in the swap
cache, but Hugh found problems with it, especially due to accounting for pages
that are read ahead to the correct cgroup.

>   - to what group charge the swap entry ?
>     To determine to what swap_cgroup (corresponding to mem_cgroup in
>     memory subsystem) the swap entry should be charged,
>     I added a pointer to mm_struct to page_cgroup(pc->pc_mm), and
>     changed the argument of get_swap_entry() from (void) to
>     (struct page *). As a result, get_swap_entry() can determine
>     to what swap_cgroup it should charge the swap entry
>     by referring to page->page_cgroup->mm_struct->swap_cgroup.
> 

I presume this is for the case when the memory and swap controllers are mounted
in different hierarchies. It seems like too many dereferences to get to the
swap_cgroup

>   - from what group uncharge the swap entry ?
>     I added to swap_info_struct a member 'struct swap_cgroup **',
>     array of pointer to which swap_cgroup the swap entry is
>     charged.
> 
> Todo:
>   - rebase new kernel, and split into some patches.
>   - Merge with memory subsystem (if it would be better), or
>     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
>     (needs to make page_cgroup more generic one).
>   - More tests, cleanups, and features   :-)  
> 
> 
> Any comments or discussions would be appreciated.
> 

To be honest, I tried looking at the code, but there were too many #ifdefs and I
sort of lost myself in them.

> Thanks,
> Daisuke Nishimura
> 

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  7:28     ` Balbir Singh
  0 siblings, 0 replies; 50+ messages in thread
From: Balbir Singh @ 2008-03-05  7:28 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: containers, linux-mm, xemul, Hugh Dickins

Daisuke Nishimura wrote:
> Hi.
> 
> Even if limiting memory usage by cgroup memory subsystem
> or isolating memory by cpuset, swap space is shared, so
> resource isolation is not enough. If one group uses up all the
> swap space, it can affect other groups.
> 

Yes, that is true. Please ensure that you also cc Hugh Dickins for all swap
related changes.

> I try making a patch of swap subsystem based on memory
> subsystem, which limits swap usage per cgroup.
> It can now charge and limit the swap usage.
> 
> I implemented this feature as a new subsystem,
> not as a part of memory subsystem, because I don't want to
> make big change to memcontrol.c, and even if implemented
> as other subsystem, users can manage memory and swap on
> the same cgroup directory if mount them together.
> 

I agree, the swap system should be independent of the memory resource controller.

> Basic idea of my implementation:
>   - what will be charged ?
>     the number of swap entries.
> 
>   - when to charge/uncharge ?
>     charge at get_swap_entry(), and uncharge at swap_entry_free().
> 

You mean get_swap_page(), I suppose. The assumption in the code is that every
swap page being charged has already been charged by the memory controller (that
will go against making the controllers independent). Also, be careful of any
charge operations under a spin_lock(). We tried controlling pages in the swap
cache, but Hugh found problems with it, especially due to accounting for pages
that are read ahead to the correct cgroup.

>   - to what group charge the swap entry ?
>     To determine to what swap_cgroup (corresponding to mem_cgroup in
>     memory subsystem) the swap entry should be charged,
>     I added a pointer to mm_struct to page_cgroup(pc->pc_mm), and
>     changed the argument of get_swap_entry() from (void) to
>     (struct page *). As a result, get_swap_entry() can determine
>     to what swap_cgroup it should charge the swap entry
>     by referring to page->page_cgroup->mm_struct->swap_cgroup.
> 

I presume this is for the case when the memory and swap controllers are mounted
in different hierarchies. It seems like too many dereferences to get to the
swap_cgroup

>   - from what group uncharge the swap entry ?
>     I added to swap_info_struct a member 'struct swap_cgroup **',
>     array of pointer to which swap_cgroup the swap entry is
>     charged.
> 
> Todo:
>   - rebase new kernel, and split into some patches.
>   - Merge with memory subsystem (if it would be better), or
>     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
>     (needs to make page_cgroup more generic one).
>   - More tests, cleanups, and features   :-)  
> 
> 
> Any comments or discussions would be appreciated.
> 

To be honest, I tried looking at the code, but there were too many #ifdefs and I
sort of lost myself in them.

> Thanks,
> Daisuke Nishimura
> 

-- 
	Warm Regards,
	Balbir Singh
	Linux Technology Center
	IBM, ISTL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CE4BB6.8050803-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  7:28     ` Balbir Singh
@ 2008-03-07  4:23         ` Daisuke Nishimura
  -1 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-07  4:23 UTC (permalink / raw)
  To: balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Hugh Dickins,
	xemul-GEFAQzZX7r8dnm+yROfE0A

Hi.

Balbir Singh wrote:
> Daisuke Nishimura wrote:
>> Basic idea of my implementation:
>>   - what will be charged ?
>>     the number of swap entries.
>>
>>   - when to charge/uncharge ?
>>     charge at get_swap_entry(), and uncharge at swap_entry_free().
>>
> 
> You mean get_swap_page(), I suppose. The assumption in the code is that every
> swap page being charged has already been charged by the memory controller (that
> will go against making the controllers independent). Also, be careful of any

To make swap-limit independent of the memory subsystem, I think the
page_cgroup code should be separated into two parts,
subsystem-independent and subsystem-dependent: that is, the
part associating a page with its page_cgroup and the part associating
a page_cgroup with a subsystem.

Rather than doing such a thing, I now think that
it would be better to implement swap-limit as part of
the memory subsystem.


Thanks,
Daisuke Nishimura.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-07  4:23         ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-07  4:23 UTC (permalink / raw)
  To: balbir; +Cc: containers, linux-mm, xemul, Hugh Dickins

Hi.

Balbir Singh wrote:
> Daisuke Nishimura wrote:
>> Basic idea of my implementation:
>>   - what will be charged ?
>>     the number of swap entries.
>>
>>   - when to charge/uncharge ?
>>     charge at get_swap_entry(), and uncharge at swap_entry_free().
>>
> 
> You mean get_swap_page(), I suppose. The assumption in the code is that every
> swap page being charged has already been charged by the memory controller (that
> will go against making the controllers independent). Also, be careful of any

To make swap-limit independent of the memory subsystem, I think the
page_cgroup code should be separated into two parts,
subsystem-independent and subsystem-dependent: that is, the
part associating a page with its page_cgroup and the part associating
a page_cgroup with a subsystem.

Rather than doing such a thing, I now think that
it would be better to implement swap-limit as part of
the memory subsystem.


Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  5:59 ` Daisuke Nishimura
@ 2008-03-05  8:33     ` Pavel Emelyanov
  -1 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-05  8:33 UTC (permalink / raw)
  To: Daisuke Nishimura
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, xemul-GEFAQzZX7r8dnm+yROfE0A,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Daisuke Nishimura wrote:
> Hi.
> 
> Even if limiting memory usage by cgroup memory subsystem
> or isolating memory by cpuset, swap space is shared, so
> resource isolation is not enough. If one group uses up all the
> swap space, it can affect other groups.
> 
> I try making a patch of swap subsystem based on memory
> subsystem, which limits swap usage per cgroup.
> It can now charge and limit the swap usage.
> 
> I implemented this feature as a new subsystem,
> not as a part of memory subsystem, because I don't want to
> make big change to memcontrol.c, and even if implemented
> as other subsystem, users can manage memory and swap on
> the same cgroup directory if mount them together.
> 
> Basic idea of my implementation:
>   - what will be charged ?
>     the number of swap entries.

A "swap entry" is a very obscure thing for the end user. People
would prefer accounting in bytes.

>   - when to charge/uncharge ?
>     charge at get_swap_entry(), and uncharge at swap_entry_free().
> 
>   - to what group charge the swap entry ?
>     To determine to what swap_cgroup (corresponding to mem_cgroup in
>     memory subsystem) the swap entry should be charged,
>     I added a pointer to mm_struct to page_cgroup(pc->pc_mm), and
>     changed the argument of get_swap_entry() from (void) to
>     (struct page *). As a result, get_swap_entry() can determine
>     to what swap_cgroup it should charge the swap entry
>     by referring to page->page_cgroup->mm_struct->swap_cgroup.
> 
>   - from what group uncharge the swap entry ?
>     I added to swap_info_struct a member 'struct swap_cgroup **',
>     array of pointer to which swap_cgroup the swap entry is
>     charged.
> 
> Todo:
>   - rebase new kernel, and split into some patches.
>   - Merge with memory subsystem (if it would be better), or
>     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
>     (needs to make page_cgroup more generic one).

Merge is a must IMHO. I can hardly imagine a situation in which
someone would need these two separately.

>   - More tests, cleanups, and feartures   :-)  
> 
> 
> Any comments or discussions would be appreciated.
> 
> Thanks,
> Daisuke Nishimura
> 
> 
> Signed-off-by: Daisuke Nishimura <nishimura-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>
> 
> ---
> diff -uprN linux-2.6.24-mm1/include/linux/cgroup_subsys.h linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h
> --- linux-2.6.24-mm1/include/linux/cgroup_subsys.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h	2008-03-03 10:56:56.000000000 +0900
> @@ -42,3 +42,9 @@ SUBSYS(mem_cgroup)
>  #endif
>  
>  /* */
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +SUBSYS(swap)
> +#endif
> +
> +/* */
> diff -uprN linux-2.6.24-mm1/include/linux/memcontrol.h linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h
> --- linux-2.6.24-mm1/include/linux/memcontrol.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h	2008-03-03 10:56:56.000000000 +0900
> @@ -29,6 +29,21 @@ struct page;
>  struct mm_struct;
>  
>  #ifdef CONFIG_CGROUP_MEM_CONT
> +/*
> + * A page_cgroup page is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup
> + */
> +struct page_cgroup {
> +	struct list_head lru;		/* per cgroup LRU list */
> +	struct page *page;
> +	struct mem_cgroup *mem_cgroup;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct mm_struct *pc_mm;
> +#endif

Try not to add new entries here.

> +	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> +					/* mapped and cached states     */
> +	int	 flags;
> +};
>  
>  extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
>  extern void mm_free_cgroup(struct mm_struct *mm);
> diff -uprN linux-2.6.24-mm1/include/linux/mm_types.h linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h
> --- linux-2.6.24-mm1/include/linux/mm_types.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h	2008-03-03 10:56:56.000000000 +0900
> @@ -233,6 +233,9 @@ struct mm_struct {
>  #ifdef CONFIG_CGROUP_MEM_CONT
>  	struct mem_cgroup *mem_cgroup;
>  #endif
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup *swap_cgroup;
> +#endif
>  };
>  
>  #endif /* _LINUX_MM_TYPES_H */
> diff -uprN linux-2.6.24-mm1/include/linux/swap.h linux-2.6.24-mm1-swaplimit/include/linux/swap.h
> --- linux-2.6.24-mm1/include/linux/swap.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/swap.h	2008-03-03 10:56:56.000000000 +0900
> @@ -7,6 +7,7 @@
>  #include <linux/list.h>
>  #include <linux/memcontrol.h>
>  #include <linux/sched.h>
> +#include <linux/swap_limit.h>
>  
>  #include <asm/atomic.h>
>  #include <asm/page.h>
> @@ -141,6 +142,9 @@ struct swap_info_struct {
>  	struct swap_extent *curr_swap_extent;
>  	unsigned old_block_size;
>  	unsigned short * swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup **swap_cgroup;
> +#endif
>  	unsigned int lowest_bit;
>  	unsigned int highest_bit;
>  	unsigned int cluster_next;
> @@ -239,7 +243,7 @@ extern struct page *swapin_readahead(swp
>  extern long total_swap_pages;
>  extern unsigned int nr_swapfiles;
>  extern void si_swapinfo(struct sysinfo *);
> -extern swp_entry_t get_swap_page(void);
> +extern swp_entry_t get_swap_page(struct page *);
>  extern swp_entry_t get_swap_page_of_type(int);
>  extern int swap_duplicate(swp_entry_t);
>  extern int valid_swaphandles(swp_entry_t, unsigned long *);
> @@ -342,7 +346,7 @@ static inline int remove_exclusive_swap_
>  	return 0;
>  }
>  
> -static inline swp_entry_t get_swap_page(void)
> +static inline swp_entry_t get_swap_page(struct page *page)
>  {
>  	swp_entry_t entry;
>  	entry.val = 0;
> diff -uprN linux-2.6.24-mm1/include/linux/swap_limit.h linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h
> --- linux-2.6.24-mm1/include/linux/swap_limit.h	1970-01-01 09:00:00.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h	2008-03-03 10:56:56.000000000 +0900
> @@ -0,0 +1,65 @@
> +/*
> + * swap_limit.h
> + *
> + */
> +#ifndef _LINUX_SWAP_LIMIT_H
> +#define _LINUX_SWAP_LIMIT_H
> +
> +#include <linux/swap.h>
> +#include <linux/cgroup.h>
> +#include <linux/res_counter.h>
> +
> +struct swap_cgroup;
> +struct swap_info_struct;
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +struct swap_cgroup {
> +	struct cgroup_subsys_state css;
> +	struct res_counter res;
> +};
> +
> +static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
> +{
> +	return container_of(cgroup_subsys_state(cgrp, swap_subsys_id),
> +				struct swap_cgroup,
> +				css);
> +}
> +
> +static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
> +{
> +	return container_of(task_subsys_state(p, swap_subsys_id),
> +				struct swap_cgroup, css);
> +}
> +
> +extern int swap_cgroup_charge(struct page *page,
> +				struct swap_info_struct *si,
> +				unsigned long offset);
> +extern void swap_cgroup_uncharge(struct swap_info_struct *si,
> +				unsigned long offset);
> +
> +#else /* CONFIG_CGROUP_SWAP_LIMIT */
> +static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
> +{
> +	return NULL;
> +}
> +
> +static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
> +{
> +	return NULL;
> +}
> +
> +static inline int swap_cgroup_charge(struct page *page,
> +					struct swap_info_struct *si,
> +					unsigned long offset)
> +{
> +	return 0;
> +}
> +
> +static inline void swap_cgroup_uncharge(struct swap_info_struct *si,
> +					unsigned long offset)
> +{
> +}
> +
> +#endif
> +
> +#endif
> diff -uprN linux-2.6.24-mm1/init/Kconfig linux-2.6.24-mm1-swaplimit/init/Kconfig
> --- linux-2.6.24-mm1/init/Kconfig	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/init/Kconfig	2008-03-03 10:56:56.000000000 +0900
> @@ -383,6 +383,12 @@ config CGROUP_MEM_CONT
>  	  Provides a memory controller that manages both page cache and
>  	  RSS memory.
>  
> +config CGROUP_SWAP_LIMIT
> +	bool "cgroup subsystem for swap"
> +	depends on CGROUP_MEM_CONT && SWAP
> +	help
> +	  Provides a swap controller that manages and limits swap usage.
> +
>  config PROC_PID_CPUSET
>  	bool "Include legacy /proc/<pid>/cpuset file"
>  	depends on CPUSETS
> diff -uprN linux-2.6.24-mm1/mm/Makefile linux-2.6.24-mm1-swaplimit/mm/Makefile
> --- linux-2.6.24-mm1/mm/Makefile	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/Makefile	2008-03-03 10:56:56.000000000 +0900
> @@ -32,4 +32,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
> +obj-$(CONFIG_CGROUP_SWAP_LIMIT) += swap_limit.o
>  
> diff -uprN linux-2.6.24-mm1/mm/memcontrol.c linux-2.6.24-mm1-swaplimit/mm/memcontrol.c
> --- linux-2.6.24-mm1/mm/memcontrol.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/memcontrol.c	2008-03-03 10:56:56.000000000 +0900
> @@ -19,6 +19,7 @@
>  
>  #include <linux/res_counter.h>
>  #include <linux/memcontrol.h>
> +#include <linux/swap_limit.h>
>  #include <linux/cgroup.h>
>  #include <linux/mm.h>
>  #include <linux/smp.h>
> @@ -146,18 +147,6 @@ struct mem_cgroup {
>  #define PAGE_CGROUP_LOCK_BIT 	0x0
>  #define PAGE_CGROUP_LOCK 		(1 << PAGE_CGROUP_LOCK_BIT)
>  
> -/*
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - */
> -struct page_cgroup {
> -	struct list_head lru;		/* per cgroup LRU list */
> -	struct page *page;
> -	struct mem_cgroup *mem_cgroup;
> -	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> -					/* mapped and cached states     */
> -	int	 flags;
> -};
>  #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
>  #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
>  
> @@ -254,15 +243,27 @@ struct mem_cgroup *mem_cgroup_from_task(
>  void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
>  {
>  	struct mem_cgroup *mem;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup *swap;
> +#endif
>  
>  	mem = mem_cgroup_from_task(p);
>  	css_get(&mem->css);
>  	mm->mem_cgroup = mem;
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	swap = swap_cgroup_from_task(p);
> +	css_get(&swap->css);
> +	mm->swap_cgroup = swap;
> +#endif
>  }
>  
>  void mm_free_cgroup(struct mm_struct *mm)
>  {
>  	css_put(&mm->mem_cgroup->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	css_put(&mm->swap_cgroup->css);
> +#endif
>  }
>  
>  static inline int page_cgroup_locked(struct page *page)
> @@ -664,6 +665,10 @@ retry:
>  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
>  		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	atomic_inc(&mm->mm_count);
> +	pc->pc_mm = mm;
> +#endif

What kernel is this patch for? I cannot find this code in 2.6.25-rc3-mm1

>  	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
>  		/*
> @@ -673,6 +678,9 @@ retry:
>  		 */
>  		res_counter_uncharge(&mem->res, PAGE_SIZE);
>  		css_put(&mem->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +		mmdrop(mm);
> +#endif
>  		kfree(pc);
>  		if (!page)
>  			goto done;
> @@ -744,6 +752,9 @@ void mem_cgroup_uncharge(struct page_cgr
>  		if (clear_page_cgroup(page, pc) == pc) {
>  			mem = pc->mem_cgroup;
>  			css_put(&mem->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +			mmdrop(pc->pc_mm);
> +#endif
>  			res_counter_uncharge(&mem->res, PAGE_SIZE);
>  			spin_lock_irqsave(&mz->lru_lock, flags);
>  			__mem_cgroup_remove_list(pc);
> @@ -859,6 +870,9 @@ retry:
>  		atomic_set(&pc->ref_cnt, 0);
>  		if (clear_page_cgroup(page, pc) == pc) {
>  			css_put(&mem->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +			mmdrop(pc->pc_mm);
> +#endif
>  			res_counter_uncharge(&mem->res, PAGE_SIZE);
>  			__mem_cgroup_remove_list(pc);
>  			kfree(pc);
> diff -uprN linux-2.6.24-mm1/mm/shmem.c linux-2.6.24-mm1-swaplimit/mm/shmem.c
> --- linux-2.6.24-mm1/mm/shmem.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/shmem.c	2008-03-03 10:56:56.000000000 +0900
> @@ -1024,7 +1024,7 @@ static int shmem_writepage(struct page *
>  	 * want to check if there's a redundant swappage to be discarded.
>  	 */
>  	if (wbc->for_reclaim)
> -		swap = get_swap_page();
> +		swap = get_swap_page(page);
>  	else
>  		swap.val = 0;
>  
> diff -uprN linux-2.6.24-mm1/mm/swap_limit.c linux-2.6.24-mm1-swaplimit/mm/swap_limit.c
> --- linux-2.6.24-mm1/mm/swap_limit.c	1970-01-01 09:00:00.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/swap_limit.c	2008-03-05 14:39:23.000000000 +0900
> @@ -0,0 +1,194 @@
> +/*
> + * swap_limit.c - SWAP controller (based on memcontrol.c)
> + *
> + */
> +
> +#include <linux/err.h>
> +#include <linux/fs.h>
> +#include <linux/types.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/swap.h>
> +#include <linux/rcupdate.h>
> +#include <linux/cgroup.h>
> +#include <linux/res_counter.h>
> +#include <linux/memcontrol.h>
> +#include <linux/swap_limit.h>
> +
> +static struct swap_cgroup init_swap_cgroup;
> +
> +int swap_cgroup_charge(struct page *page,
> +			struct swap_info_struct *si,
> +			unsigned long offset)
> +{
> +	int ret;
> +	struct page_cgroup *pc;
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap;
> +
> +	BUG_ON(!page);
> +
> +	/*
> +	 * Pages to be swapped out should have been charged by memory cgroup,
> +	 * but very rarely, pc would be NULL (pc is not reliable without lock,
> +	 * so I should fix here).
> +	 * In such cases, we charge the init_mm now.
> +	 */
> +	pc = page_get_page_cgroup(page);
> +	if (WARN_ON(!pc))
> +		mm = &init_mm;
> +	else
> +		mm = pc->pc_mm;
> +	BUG_ON(!mm);
> +
> +	rcu_read_lock();
> +	swap = rcu_dereference(mm->swap_cgroup);
> +	rcu_read_unlock();
> +	BUG_ON(!swap);
> +
> +	ret = res_counter_charge(&swap->res, PAGE_SIZE);
> +	if (!ret) {
> +		css_get(&swap->css);
> +		si->swap_cgroup[offset] = swap;
> +	}
> +
> +	return ret;
> +}
> +
> +void swap_cgroup_uncharge(struct swap_info_struct *si, unsigned long offset)
> +{
> +	struct swap_cgroup *swap = si->swap_cgroup[offset];
> +
> +	/*
> +	 * "swap" would be NULL:
> +	 *  1. when get_swap_page() failed at charging swap_cgroup,
> +	 *     and called swap_entry_free().
> +	 *  2. when this swap entry had been assigned by
> +	 *     get_swap_page_of_type() (via SWSUSP ?).
> +	 */
> +	if (swap) {
> +		res_counter_uncharge(&swap->res, PAGE_SIZE);
> +		si->swap_cgroup[offset] = NULL;
> +		css_put(&swap->css);
> +	}
> +}
> +
> +static struct cgroup_subsys_state *swap_cgroup_create(struct cgroup_subsys *ss,
> +						      struct cgroup *cgrp)
> +{
> +	struct swap_cgroup *swap;
> +
> +	if (unlikely((cgrp->parent) == NULL)) {
> +		swap = &init_swap_cgroup;
> +		init_mm.swap_cgroup = swap;
> +	} else
> +		swap = kzalloc(sizeof(struct swap_cgroup), GFP_KERNEL);
> +
> +	if (swap == NULL)
> +		return ERR_PTR(-ENOMEM);
> +
> +	res_counter_init(&swap->res);
> +
> +	return &swap->css;
> +}
> +
> +static void swap_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	kfree(swap_cgroup_from_cgrp(cgrp));
> +}
> +
> +static ssize_t swap_cgroup_read(struct cgroup *cgrp,
> +				struct cftype *cft, struct file *file,
> +				char __user *userbuf, size_t nbytes,
> +				loff_t *ppos)
> +{
> +	return res_counter_read(&swap_cgroup_from_cgrp(cgrp)->res,
> +				cft->private, userbuf, nbytes, ppos,
> +				NULL);
> +}
> +
> +static int swap_cgroup_write_strategy(char *buf, unsigned long long *tmp)
> +{
> +	*tmp = memparse(buf, &buf);
> +	if (*buf != '\0')
> +		return -EINVAL;
> +
> +	/*
> +	 * Round up the value to the closest page size
> +	 */
> +	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
> +	return 0;
> +}
> +
> +static ssize_t swap_cgroup_write(struct cgroup *cgrp, struct cftype *cft,
> +				 struct file *file, const char __user *userbuf,
> +				 size_t nbytes, loff_t *ppos)
> +{
> +	return res_counter_write(&swap_cgroup_from_cgrp(cgrp)->res,
> +				 cft->private, userbuf, nbytes, ppos,
> +				 swap_cgroup_write_strategy);
> +}
> +
> +static struct cftype swap_files[] = {
> +	{
> +		.name = "usage_in_bytes",
> +		.private = RES_USAGE,
> +		.read = swap_cgroup_read,
> +	},
> +	{
> +		.name = "limit_in_bytes",
> +		.private = RES_LIMIT,
> +		.write = swap_cgroup_write,
> +		.read = swap_cgroup_read,
> +	},
> +	{
> +		.name = "failcnt",
> +		.private = RES_FAILCNT,
> +		.read = swap_cgroup_read,
> +	},
> +};
> +
> +static int swap_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	return cgroup_add_files(cgrp, ss, swap_files, ARRAY_SIZE(swap_files));
> +}
> +
> +static void swap_cgroup_move_task(struct cgroup_subsys *ss,
> +				  struct cgroup *cgrp,
> +				  struct cgroup *old_cgrp,
> +				  struct task_struct *p)
> +{
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap, *old_swap;
> +
> +	mm = get_task_mm(p);
> +	if (mm == NULL)
> +		return;
> +
> +	swap = swap_cgroup_from_cgrp(cgrp);
> +	old_swap = swap_cgroup_from_cgrp(old_cgrp);
> +
> +	if (swap == old_swap)
> +		goto out;
> +
> +	if (p->tgid != p->pid)
> +		goto out;
> +
> +	css_get(&swap->css);
> +	rcu_assign_pointer(mm->swap_cgroup, swap);
> +	css_put(&old_swap->css);
> +
> +out:
> +	mmput(mm);
> +	return;
> +}
> +
> +struct cgroup_subsys swap_subsys = {
> +	.name = "swap",
> +	.create = swap_cgroup_create,
> +	.destroy = swap_cgroup_destroy,
> +	.populate = swap_cgroup_populate,
> +	.subsys_id = swap_subsys_id,
> +	.attach = swap_cgroup_move_task,
> +	.early_init = 0,
> +};
> diff -uprN linux-2.6.24-mm1/mm/swap_state.c linux-2.6.24-mm1-swaplimit/mm/swap_state.c
> --- linux-2.6.24-mm1/mm/swap_state.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/swap_state.c	2008-03-03 10:56:56.000000000 +0900
> @@ -128,7 +128,7 @@ int add_to_swap(struct page * page, gfp_
>  	BUG_ON(!PageUptodate(page));
>  
>  	for (;;) {
> -		entry = get_swap_page();
> +		entry = get_swap_page(page);
>  		if (!entry.val)
>  			return 0;
>  
> diff -uprN linux-2.6.24-mm1/mm/swapfile.c linux-2.6.24-mm1-swaplimit/mm/swapfile.c
> --- linux-2.6.24-mm1/mm/swapfile.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/swapfile.c	2008-03-03 10:56:56.000000000 +0900
> @@ -28,6 +28,7 @@
>  #include <linux/capability.h>
>  #include <linux/syscalls.h>
>  #include <linux/memcontrol.h>
> +#include <linux/swap_limit.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
> @@ -172,7 +173,10 @@ no_page:
>  	return 0;
>  }
>  
> -swp_entry_t get_swap_page(void)
> +/* get_swap_page() calls this */
> +static int swap_entry_free(struct swap_info_struct *, unsigned long);
> +
> +swp_entry_t get_swap_page(struct page *page)
>  {
>  	struct swap_info_struct *si;
>  	pgoff_t offset;
> @@ -201,6 +205,16 @@ swp_entry_t get_swap_page(void)
>  		swap_list.next = next;
>  		offset = scan_swap_map(si);
>  		if (offset) {
> +			/*
> +			 * This should be the first use of this swap entry,
> +			 * so charge this swap entry now.
> +			 */
> +			if (swap_cgroup_charge(page, si, offset)) {
> +				/* should free this entry */

:) Please don't write comments that duplicate the next line.

> +				swap_entry_free(si, offset);
> +
> +				goto noswap;
> +			}
>  			spin_unlock(&swap_lock);
>  			return swp_entry(type, offset);
>  		}
> @@ -285,6 +299,7 @@ static int swap_entry_free(struct swap_i
>  				swap_list.next = p - swap_info;
>  			nr_swap_pages++;
>  			p->inuse_pages--;
> +			swap_cgroup_uncharge(p, offset);
>  		}
>  	}
>  	return count;
> @@ -1207,6 +1222,9 @@ asmlinkage long sys_swapoff(const char _
>  {
>  	struct swap_info_struct * p = NULL;
>  	unsigned short *swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup **swap_cgroup;
> +#endif
>  	struct file *swap_file, *victim;
>  	struct address_space *mapping;
>  	struct inode *inode;
> @@ -1309,10 +1327,17 @@ asmlinkage long sys_swapoff(const char _
>  	p->max = 0;
>  	swap_map = p->swap_map;
>  	p->swap_map = NULL;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	swap_cgroup = p->swap_cgroup;
> +	p->swap_cgroup = NULL;
> +#endif
>  	p->flags = 0;
>  	spin_unlock(&swap_lock);
>  	mutex_unlock(&swapon_mutex);
>  	vfree(swap_map);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	vfree(swap_cgroup);
> +#endif
>  	inode = mapping->host;
>  	if (S_ISBLK(inode->i_mode)) {
>  		struct block_device *bdev = I_BDEV(inode);
> @@ -1460,6 +1485,9 @@ asmlinkage long sys_swapon(const char __
>  	unsigned long maxpages = 1;
>  	int swapfilesize;
>  	unsigned short *swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup **swap_cgroup;
> +#endif
>  	struct page *page = NULL;
>  	struct inode *inode = NULL;
>  	int did_down = 0;
> @@ -1483,6 +1511,9 @@ asmlinkage long sys_swapon(const char __
>  	p->swap_file = NULL;
>  	p->old_block_size = 0;
>  	p->swap_map = NULL;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	p->swap_cgroup = NULL;
> +#endif
>  	p->lowest_bit = 0;
>  	p->highest_bit = 0;
>  	p->cluster_nr = 0;
> @@ -1647,6 +1678,15 @@ asmlinkage long sys_swapon(const char __
>  				1 /* header page */;
>  		if (error)
>  			goto bad_swap;
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +		p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
> +		if (!(p->swap_cgroup)) {
> +			error = -ENOMEM;
> +			goto bad_swap;
> +		}
> +		memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
> +#endif
>  	}
>  
>  	if (nr_good_pages) {
> @@ -1704,13 +1744,22 @@ bad_swap:
>  bad_swap_2:
>  	spin_lock(&swap_lock);
>  	swap_map = p->swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	swap_cgroup = p->swap_cgroup;
> +#endif
>  	p->swap_file = NULL;
>  	p->swap_map = NULL;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	p->swap_cgroup = NULL;
> +#endif
>  	p->flags = 0;
>  	if (!(swap_flags & SWAP_FLAG_PREFER))
>  		++least_priority;
>  	spin_unlock(&swap_lock);
>  	vfree(swap_map);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	vfree(swap_cgroup);
> +#endif
>  	if (swap_file)
>  		filp_close(swap_file, NULL);
>  out:
> 
> 
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  8:33     ` Pavel Emelyanov
  0 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-05  8:33 UTC (permalink / raw)
  To: Daisuke Nishimura; +Cc: containers, linux-mm, balbir, xemul, kamezawa.hiroyu

Daisuke Nishimura wrote:
> Hi.
> 
> Even if limiting memory usage by cgroup memory subsystem
> or isolating memory by cpuset, swap space is shared, so
> resource isolation is not enough. If one group uses up all the
> swap space, it can affect other groups.
> 
> I try making a patch of swap subsystem based on memory
> subsystem, which limits swap usage per cgroup.
> It can now charge and limit the swap usage.
> 
> I implemented this feature as a new subsystem,
> not as a part of memory subsystem, because I don't want to
> make big change to memcontrol.c, and even if implemented
> as other subsystem, users can manage memory and swap on
> the same cgroup directory if mount them together.
> 
> Basic idea of my implementation:
>   - what will be charged ?
>     the number of swap entries.

A "swap entry" is a very obscure thing for the end user. People
would prefer accounting in bytes.

>   - when to charge/uncharge ?
>     charge at get_swap_entry(), and uncharge at swap_entry_free().
> 
>   - to what group charge the swap entry ?
>     To determine to what swap_cgroup (corresponding to mem_cgroup in
>     memory subsystem) the swap entry should be charged,
>     I added a pointer to mm_struct to page_cgroup(pc->pc_mm), and
>     changed the argument of get_swap_entry() from (void) to
>     (struct page *). As a result, get_swap_entry() can determine
>     to what swap_cgroup it should charge the swap entry
>     by referring to page->page_cgroup->mm_struct->swap_cgroup.
> 
>   - from what group uncharge the swap entry ?
>     I added to swap_info_struct a member 'struct swap_cgroup **',
>     array of pointer to which swap_cgroup the swap entry is
>     charged.
> 
> Todo:
>   - rebase new kernel, and split into some patches.
>   - Merge with memory subsystem (if it would be better), or
>     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
>     (needs to make page_cgroup more generic one).

Merge is a must IMHO. I can hardly imagine a situation in which
someone would need these two separately.

>   - More tests, cleanups, and feartures   :-)  
> 
> 
> Any comments or discussions would be appreciated.
> 
> Thanks,
> Daisuke Nishimura
> 
> 
> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
> 
> ---
> diff -uprN linux-2.6.24-mm1/include/linux/cgroup_subsys.h linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h
> --- linux-2.6.24-mm1/include/linux/cgroup_subsys.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/cgroup_subsys.h	2008-03-03 10:56:56.000000000 +0900
> @@ -42,3 +42,9 @@ SUBSYS(mem_cgroup)
>  #endif
>  
>  /* */
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +SUBSYS(swap)
> +#endif
> +
> +/* */
> diff -uprN linux-2.6.24-mm1/include/linux/memcontrol.h linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h
> --- linux-2.6.24-mm1/include/linux/memcontrol.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/memcontrol.h	2008-03-03 10:56:56.000000000 +0900
> @@ -29,6 +29,21 @@ struct page;
>  struct mm_struct;
>  
>  #ifdef CONFIG_CGROUP_MEM_CONT
> +/*
> + * A page_cgroup page is associated with every page descriptor. The
> + * page_cgroup helps us identify information about the cgroup
> + */
> +struct page_cgroup {
> +	struct list_head lru;		/* per cgroup LRU list */
> +	struct page *page;
> +	struct mem_cgroup *mem_cgroup;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct mm_struct *pc_mm;
> +#endif

Try not to add new entries here.

> +	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> +					/* mapped and cached states     */
> +	int	 flags;
> +};
>  
>  extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
>  extern void mm_free_cgroup(struct mm_struct *mm);
> diff -uprN linux-2.6.24-mm1/include/linux/mm_types.h linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h
> --- linux-2.6.24-mm1/include/linux/mm_types.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/mm_types.h	2008-03-03 10:56:56.000000000 +0900
> @@ -233,6 +233,9 @@ struct mm_struct {
>  #ifdef CONFIG_CGROUP_MEM_CONT
>  	struct mem_cgroup *mem_cgroup;
>  #endif
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup *swap_cgroup;
> +#endif
>  };
>  
>  #endif /* _LINUX_MM_TYPES_H */
> diff -uprN linux-2.6.24-mm1/include/linux/swap.h linux-2.6.24-mm1-swaplimit/include/linux/swap.h
> --- linux-2.6.24-mm1/include/linux/swap.h	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/swap.h	2008-03-03 10:56:56.000000000 +0900
> @@ -7,6 +7,7 @@
>  #include <linux/list.h>
>  #include <linux/memcontrol.h>
>  #include <linux/sched.h>
> +#include <linux/swap_limit.h>
>  
>  #include <asm/atomic.h>
>  #include <asm/page.h>
> @@ -141,6 +142,9 @@ struct swap_info_struct {
>  	struct swap_extent *curr_swap_extent;
>  	unsigned old_block_size;
>  	unsigned short * swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup **swap_cgroup;
> +#endif
>  	unsigned int lowest_bit;
>  	unsigned int highest_bit;
>  	unsigned int cluster_next;
> @@ -239,7 +243,7 @@ extern struct page *swapin_readahead(swp
>  extern long total_swap_pages;
>  extern unsigned int nr_swapfiles;
>  extern void si_swapinfo(struct sysinfo *);
> -extern swp_entry_t get_swap_page(void);
> +extern swp_entry_t get_swap_page(struct page *);
>  extern swp_entry_t get_swap_page_of_type(int);
>  extern int swap_duplicate(swp_entry_t);
>  extern int valid_swaphandles(swp_entry_t, unsigned long *);
> @@ -342,7 +346,7 @@ static inline int remove_exclusive_swap_
>  	return 0;
>  }
>  
> -static inline swp_entry_t get_swap_page(void)
> +static inline swp_entry_t get_swap_page(struct page *page)
>  {
>  	swp_entry_t entry;
>  	entry.val = 0;
> diff -uprN linux-2.6.24-mm1/include/linux/swap_limit.h linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h
> --- linux-2.6.24-mm1/include/linux/swap_limit.h	1970-01-01 09:00:00.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/include/linux/swap_limit.h	2008-03-03 10:56:56.000000000 +0900
> @@ -0,0 +1,65 @@
> +/*
> + * swap_limit.h
> + *
> + */
> +#ifndef _LINUX_SWAP_LIMIT_H
> +#define _LINUX_SWAP_LIMIT_H
> +
> +#include <linux/swap.h>
> +#include <linux/cgroup.h>
> +#include <linux/res_counter.h>
> +
> +struct swap_cgroup;
> +struct swap_info_struct;
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +struct swap_cgroup {
> +	struct cgroup_subsys_state css;
> +	struct res_counter res;
> +};
> +
> +static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
> +{
> +	return container_of(cgroup_subsys_state(cgrp, swap_subsys_id),
> +				struct swap_cgroup,
> +				css);
> +}
> +
> +static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
> +{
> +	return container_of(task_subsys_state(p, swap_subsys_id),
> +				struct swap_cgroup, css);
> +}
> +
> +extern int swap_cgroup_charge(struct page *page,
> +				struct swap_info_struct *si,
> +				unsigned long offset);
> +extern void swap_cgroup_uncharge(struct swap_info_struct *si,
> +				unsigned long offset);
> +
> +#else /* CONFIG_CGROUP_SWAP_LIMIT */
> +static inline struct swap_cgroup *swap_cgroup_from_cgrp(struct cgroup *cgrp)
> +{
> +	return NULL;
> +}
> +
> +static inline struct swap_cgroup *swap_cgroup_from_task(struct task_struct *p)
> +{
> +	return NULL;
> +}
> +
> +static inline int swap_cgroup_charge(struct page *page,
> +					struct swap_info_struct *si,
> +					unsigned long offset)
> +{
> +	return 0;
> +}
> +
> +static inline void swap_cgroup_uncharge(struct swap_info_struct *si,
> +					unsigned long offset)
> +{
> +}
> +
> +#endif
> +
> +#endif
> diff -uprN linux-2.6.24-mm1/init/Kconfig linux-2.6.24-mm1-swaplimit/init/Kconfig
> --- linux-2.6.24-mm1/init/Kconfig	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/init/Kconfig	2008-03-03 10:56:56.000000000 +0900
> @@ -383,6 +383,12 @@ config CGROUP_MEM_CONT
>  	  Provides a memory controller that manages both page cache and
>  	  RSS memory.
>  
> +config CGROUP_SWAP_LIMIT
> +	bool "cgroup subsystem for swap"
> +	depends on CGROUP_MEM_CONT && SWAP
> +	help
> +	  Provides a swap controller that manages and limits swap usage.
> +
>  config PROC_PID_CPUSET
>  	bool "Include legacy /proc/<pid>/cpuset file"
>  	depends on CPUSETS
> diff -uprN linux-2.6.24-mm1/mm/Makefile linux-2.6.24-mm1-swaplimit/mm/Makefile
> --- linux-2.6.24-mm1/mm/Makefile	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/Makefile	2008-03-03 10:56:56.000000000 +0900
> @@ -32,4 +32,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
> +obj-$(CONFIG_CGROUP_SWAP_LIMIT) += swap_limit.o
>  
> diff -uprN linux-2.6.24-mm1/mm/memcontrol.c linux-2.6.24-mm1-swaplimit/mm/memcontrol.c
> --- linux-2.6.24-mm1/mm/memcontrol.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/memcontrol.c	2008-03-03 10:56:56.000000000 +0900
> @@ -19,6 +19,7 @@
>  
>  #include <linux/res_counter.h>
>  #include <linux/memcontrol.h>
> +#include <linux/swap_limit.h>
>  #include <linux/cgroup.h>
>  #include <linux/mm.h>
>  #include <linux/smp.h>
> @@ -146,18 +147,6 @@ struct mem_cgroup {
>  #define PAGE_CGROUP_LOCK_BIT 	0x0
>  #define PAGE_CGROUP_LOCK 		(1 << PAGE_CGROUP_LOCK_BIT)
>  
> -/*
> - * A page_cgroup page is associated with every page descriptor. The
> - * page_cgroup helps us identify information about the cgroup
> - */
> -struct page_cgroup {
> -	struct list_head lru;		/* per cgroup LRU list */
> -	struct page *page;
> -	struct mem_cgroup *mem_cgroup;
> -	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
> -					/* mapped and cached states     */
> -	int	 flags;
> -};
>  #define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
>  #define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
>  
> @@ -254,15 +243,27 @@ struct mem_cgroup *mem_cgroup_from_task(
>  void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
>  {
>  	struct mem_cgroup *mem;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup *swap;
> +#endif
>  
>  	mem = mem_cgroup_from_task(p);
>  	css_get(&mem->css);
>  	mm->mem_cgroup = mem;
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	swap = swap_cgroup_from_task(p);
> +	css_get(&swap->css);
> +	mm->swap_cgroup = swap;
> +#endif
>  }
>  
>  void mm_free_cgroup(struct mm_struct *mm)
>  {
>  	css_put(&mm->mem_cgroup->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	css_put(&mm->swap_cgroup->css);
> +#endif
>  }
>  
>  static inline int page_cgroup_locked(struct page *page)
> @@ -664,6 +665,10 @@ retry:
>  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
>  		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	atomic_inc(&mm->mm_count);
> +	pc->pc_mm = mm;
> +#endif

What kernel is this patch for? I cannot find this code in 2.6.25-rc3-mm1

>  	if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
>  		/*
> @@ -673,6 +678,9 @@ retry:
>  		 */
>  		res_counter_uncharge(&mem->res, PAGE_SIZE);
>  		css_put(&mem->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +		mmdrop(mm);
> +#endif
>  		kfree(pc);
>  		if (!page)
>  			goto done;
> @@ -744,6 +752,9 @@ void mem_cgroup_uncharge(struct page_cgr
>  		if (clear_page_cgroup(page, pc) == pc) {
>  			mem = pc->mem_cgroup;
>  			css_put(&mem->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +			mmdrop(pc->pc_mm);
> +#endif
>  			res_counter_uncharge(&mem->res, PAGE_SIZE);
>  			spin_lock_irqsave(&mz->lru_lock, flags);
>  			__mem_cgroup_remove_list(pc);
> @@ -859,6 +870,9 @@ retry:
>  		atomic_set(&pc->ref_cnt, 0);
>  		if (clear_page_cgroup(page, pc) == pc) {
>  			css_put(&mem->css);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +			mmdrop(pc->pc_mm);
> +#endif
>  			res_counter_uncharge(&mem->res, PAGE_SIZE);
>  			__mem_cgroup_remove_list(pc);
>  			kfree(pc);
> diff -uprN linux-2.6.24-mm1/mm/shmem.c linux-2.6.24-mm1-swaplimit/mm/shmem.c
> --- linux-2.6.24-mm1/mm/shmem.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/shmem.c	2008-03-03 10:56:56.000000000 +0900
> @@ -1024,7 +1024,7 @@ static int shmem_writepage(struct page *
>  	 * want to check if there's a redundant swappage to be discarded.
>  	 */
>  	if (wbc->for_reclaim)
> -		swap = get_swap_page();
> +		swap = get_swap_page(page);
>  	else
>  		swap.val = 0;
>  
> diff -uprN linux-2.6.24-mm1/mm/swap_limit.c linux-2.6.24-mm1-swaplimit/mm/swap_limit.c
> --- linux-2.6.24-mm1/mm/swap_limit.c	1970-01-01 09:00:00.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/swap_limit.c	2008-03-05 14:39:23.000000000 +0900
> @@ -0,0 +1,194 @@
> +/*
> + * swap_limit.c - SWAP controller (based on memcontrol.c)
> + *
> + */
> +
> +#include <linux/err.h>
> +#include <linux/fs.h>
> +#include <linux/types.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/swap.h>
> +#include <linux/rcupdate.h>
> +#include <linux/cgroup.h>
> +#include <linux/res_counter.h>
> +#include <linux/memcontrol.h>
> +#include <linux/swap_limit.h>
> +
> +static struct swap_cgroup init_swap_cgroup;
> +
> +int swap_cgroup_charge(struct page *page,
> +			struct swap_info_struct *si,
> +			unsigned long offset)
> +{
> +	int ret;
> +	struct page_cgroup *pc;
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap;
> +
> +	BUG_ON(!page);
> +
> +	/*
> +	 * Pages to be swapped out should have been charged by memory cgroup,
> +	 * but very rarely, pc would be NULL (pc is not reliable without lock,
> +	 * so I should fix here).
> +	 * In such cases, we charge the init_mm now.
> +	 */
> +	pc = page_get_page_cgroup(page);
> +	if (WARN_ON(!pc))
> +		mm = &init_mm;
> +	else
> +		mm = pc->pc_mm;
> +	BUG_ON(!mm);
> +
> +	rcu_read_lock();
> +	swap = rcu_dereference(mm->swap_cgroup);
> +	rcu_read_unlock();
> +	BUG_ON(!swap);
> +
> +	ret = res_counter_charge(&swap->res, PAGE_SIZE);
> +	if (!ret) {
> +		css_get(&swap->css);
> +		si->swap_cgroup[offset] = swap;
> +	}
> +
> +	return ret;
> +}
> +
> +void swap_cgroup_uncharge(struct swap_info_struct *si, unsigned long offset)
> +{
> +	struct swap_cgroup *swap = si->swap_cgroup[offset];
> +
> +	/*
> +	 * "swap" would be NULL:
> +	 *  1. when get_swap_page() failed at charging swap_cgroup,
> +	 *     and called swap_entry_free().
> +	 *  2. when this swap entry had been assigned by
> +	 *     get_swap_page_of_type() (via SWSUSP ?).
> +	 */
> +	if (swap) {
> +		res_counter_uncharge(&swap->res, PAGE_SIZE);
> +		si->swap_cgroup[offset] = NULL;
> +		css_put(&swap->css);
> +	}
> +}
> +
> +static struct cgroup_subsys_state *swap_cgroup_create(struct cgroup_subsys *ss,
> +						      struct cgroup *cgrp)
> +{
> +	struct swap_cgroup *swap;
> +
> +	if (unlikely((cgrp->parent) == NULL)) {
> +		swap = &init_swap_cgroup;
> +		init_mm.swap_cgroup = swap;
> +	} else
> +		swap = kzalloc(sizeof(struct swap_cgroup), GFP_KERNEL);
> +
> +	if (swap == NULL)
> +		return ERR_PTR(-ENOMEM);
> +
> +	res_counter_init(&swap->res);
> +
> +	return &swap->css;
> +}
> +
> +static void swap_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	kfree(swap_cgroup_from_cgrp(cgrp));
> +}
> +
> +static ssize_t swap_cgroup_read(struct cgroup *cgrp,
> +				struct cftype *cft, struct file *file,
> +				char __user *userbuf, size_t nbytes,
> +				loff_t *ppos)
> +{
> +	return res_counter_read(&swap_cgroup_from_cgrp(cgrp)->res,
> +				cft->private, userbuf, nbytes, ppos,
> +				NULL);
> +}
> +
> +static int swap_cgroup_write_strategy(char *buf, unsigned long long *tmp)
> +{
> +	*tmp = memparse(buf, &buf);
> +	if (*buf != '\0')
> +		return -EINVAL;
> +
> +	/*
> +	 * Round up the value to the closest page size
> +	 */
> +	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
> +	return 0;
> +}
> +
> +static ssize_t swap_cgroup_write(struct cgroup *cgrp, struct cftype *cft,
> +				 struct file *file, const char __user *userbuf,
> +				 size_t nbytes, loff_t *ppos)
> +{
> +	return res_counter_write(&swap_cgroup_from_cgrp(cgrp)->res,
> +				 cft->private, userbuf, nbytes, ppos,
> +				 swap_cgroup_write_strategy);
> +}
> +
> +static struct cftype swap_files[] = {
> +	{
> +		.name = "usage_in_bytes",
> +		.private = RES_USAGE,
> +		.read = swap_cgroup_read,
> +	},
> +	{
> +		.name = "limit_in_bytes",
> +		.private = RES_LIMIT,
> +		.write = swap_cgroup_write,
> +		.read = swap_cgroup_read,
> +	},
> +	{
> +		.name = "failcnt",
> +		.private = RES_FAILCNT,
> +		.read = swap_cgroup_read,
> +	},
> +};
> +
> +static int swap_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	return cgroup_add_files(cgrp, ss, swap_files, ARRAY_SIZE(swap_files));
> +}
> +
> +static void swap_cgroup_move_task(struct cgroup_subsys *ss,
> +				  struct cgroup *cgrp,
> +				  struct cgroup *old_cgrp,
> +				  struct task_struct *p)
> +{
> +	struct mm_struct *mm;
> +	struct swap_cgroup *swap, *old_swap;
> +
> +	mm = get_task_mm(p);
> +	if (mm == NULL)
> +		return;
> +
> +	swap = swap_cgroup_from_cgrp(cgrp);
> +	old_swap = swap_cgroup_from_cgrp(old_cgrp);
> +
> +	if (swap == old_swap)
> +		goto out;
> +
> +	if (p->tgid != p->pid)
> +		goto out;
> +
> +	css_get(&swap->css);
> +	rcu_assign_pointer(mm->swap_cgroup, swap);
> +	css_put(&old_swap->css);
> +
> +out:
> +	mmput(mm);
> +	return;
> +}
> +
> +struct cgroup_subsys swap_subsys = {
> +	.name = "swap",
> +	.create = swap_cgroup_create,
> +	.destroy = swap_cgroup_destroy,
> +	.populate = swap_cgroup_populate,
> +	.subsys_id = swap_subsys_id,
> +	.attach = swap_cgroup_move_task,
> +	.early_init = 0,
> +};
> diff -uprN linux-2.6.24-mm1/mm/swap_state.c linux-2.6.24-mm1-swaplimit/mm/swap_state.c
> --- linux-2.6.24-mm1/mm/swap_state.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/swap_state.c	2008-03-03 10:56:56.000000000 +0900
> @@ -128,7 +128,7 @@ int add_to_swap(struct page * page, gfp_
>  	BUG_ON(!PageUptodate(page));
>  
>  	for (;;) {
> -		entry = get_swap_page();
> +		entry = get_swap_page(page);
>  		if (!entry.val)
>  			return 0;
>  
> diff -uprN linux-2.6.24-mm1/mm/swapfile.c linux-2.6.24-mm1-swaplimit/mm/swapfile.c
> --- linux-2.6.24-mm1/mm/swapfile.c	2008-02-04 14:34:24.000000000 +0900
> +++ linux-2.6.24-mm1-swaplimit/mm/swapfile.c	2008-03-03 10:56:56.000000000 +0900
> @@ -28,6 +28,7 @@
>  #include <linux/capability.h>
>  #include <linux/syscalls.h>
>  #include <linux/memcontrol.h>
> +#include <linux/swap_limit.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
> @@ -172,7 +173,10 @@ no_page:
>  	return 0;
>  }
>  
> -swp_entry_t get_swap_page(void)
> +/* get_swap_page() calls this */
> +static int swap_entry_free(struct swap_info_struct *, unsigned long);
> +
> +swp_entry_t get_swap_page(struct page *page)
>  {
>  	struct swap_info_struct *si;
>  	pgoff_t offset;
> @@ -201,6 +205,16 @@ swp_entry_t get_swap_page(void)
>  		swap_list.next = next;
>  		offset = scan_swap_map(si);
>  		if (offset) {
> +			/*
> +			 * This should be the first use of this swap entry,
> +			 * so charge this swap entry now.
> +			 */
> +			if (swap_cgroup_charge(page, si, offset)) {
> +				/* should free this entry */

:) Please don't add comments that merely duplicate the next line.

> +				swap_entry_free(si, offset);
> +
> +				goto noswap;
> +			}
>  			spin_unlock(&swap_lock);
>  			return swp_entry(type, offset);
>  		}
> @@ -285,6 +299,7 @@ static int swap_entry_free(struct swap_i
>  				swap_list.next = p - swap_info;
>  			nr_swap_pages++;
>  			p->inuse_pages--;
> +			swap_cgroup_uncharge(p, offset);
>  		}
>  	}
>  	return count;
> @@ -1207,6 +1222,9 @@ asmlinkage long sys_swapoff(const char _
>  {
>  	struct swap_info_struct * p = NULL;
>  	unsigned short *swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup **swap_cgroup;
> +#endif
>  	struct file *swap_file, *victim;
>  	struct address_space *mapping;
>  	struct inode *inode;
> @@ -1309,10 +1327,17 @@ asmlinkage long sys_swapoff(const char _
>  	p->max = 0;
>  	swap_map = p->swap_map;
>  	p->swap_map = NULL;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	swap_cgroup = p->swap_cgroup;
> +	p->swap_cgroup = NULL;
> +#endif
>  	p->flags = 0;
>  	spin_unlock(&swap_lock);
>  	mutex_unlock(&swapon_mutex);
>  	vfree(swap_map);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	vfree(swap_cgroup);
> +#endif
>  	inode = mapping->host;
>  	if (S_ISBLK(inode->i_mode)) {
>  		struct block_device *bdev = I_BDEV(inode);
> @@ -1460,6 +1485,9 @@ asmlinkage long sys_swapon(const char __
>  	unsigned long maxpages = 1;
>  	int swapfilesize;
>  	unsigned short *swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	struct swap_cgroup **swap_cgroup;
> +#endif
>  	struct page *page = NULL;
>  	struct inode *inode = NULL;
>  	int did_down = 0;
> @@ -1483,6 +1511,9 @@ asmlinkage long sys_swapon(const char __
>  	p->swap_file = NULL;
>  	p->old_block_size = 0;
>  	p->swap_map = NULL;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	p->swap_cgroup = NULL;
> +#endif
>  	p->lowest_bit = 0;
>  	p->highest_bit = 0;
>  	p->cluster_nr = 0;
> @@ -1647,6 +1678,15 @@ asmlinkage long sys_swapon(const char __
>  				1 /* header page */;
>  		if (error)
>  			goto bad_swap;
> +
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +		p->swap_cgroup = vmalloc(maxpages * sizeof(*swap_cgroup));
> +		if (!(p->swap_cgroup)) {
> +			error = -ENOMEM;
> +			goto bad_swap;
> +		}
> +		memset(p->swap_cgroup, 0, maxpages * sizeof(*swap_cgroup));
> +#endif
>  	}
>  
>  	if (nr_good_pages) {
> @@ -1704,13 +1744,22 @@ bad_swap:
>  bad_swap_2:
>  	spin_lock(&swap_lock);
>  	swap_map = p->swap_map;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	swap_cgroup = p->swap_cgroup;
> +#endif
>  	p->swap_file = NULL;
>  	p->swap_map = NULL;
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	p->swap_cgroup = NULL;
> +#endif
>  	p->flags = 0;
>  	if (!(swap_flags & SWAP_FLAG_PREFER))
>  		++least_priority;
>  	spin_unlock(&swap_lock);
>  	vfree(swap_map);
> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
> +	vfree(swap_cgroup);
> +#endif
>  	if (swap_file)
>  		filp_close(swap_file, NULL);
>  out:
> 
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CE5AE2.2050303-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  8:33     ` Pavel Emelyanov
@ 2008-03-05  8:51         ` Daisuke Nishimura
  -1 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-05  8:51 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, hugh-DTz5qymZ9yRBDgjK7y7TUQ,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Hi.

>> @@ -664,6 +665,10 @@ retry:
>>  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
>>  		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
>> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>> +	atomic_inc(&mm->mm_count);
>> +	pc->pc_mm = mm;
>> +#endif
> 
> What kernel is this patch for? I cannot find this code in 2.6.25-rc3-mm1
> 
For linux-2.6.24-mm1.

Thanks,
Daisuke Nishimura.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05  8:51         ` Daisuke Nishimura
  0 siblings, 0 replies; 50+ messages in thread
From: Daisuke Nishimura @ 2008-03-05  8:51 UTC (permalink / raw)
  To: Pavel Emelyanov; +Cc: containers, linux-mm, balbir, kamezawa.hiroyu, hugh

Hi.

>> @@ -664,6 +665,10 @@ retry:
>>  	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
>>  	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
>>  		pc->flags |= PAGE_CGROUP_FLAG_CACHE;
>> +#ifdef CONFIG_CGROUP_SWAP_LIMIT
>> +	atomic_inc(&mm->mm_count);
>> +	pc->pc_mm = mm;
>> +#endif
> 
> What kernel is this patch for? I cannot find this code in 2.6.25-rc3-mm1
> 
For linux-2.6.24-mm1.

Thanks,
Daisuke Nishimura.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05  8:33     ` Pavel Emelyanov
@ 2008-03-05 14:07         ` Hugh Dickins
  -1 siblings, 0 replies; 50+ messages in thread
From: Hugh Dickins @ 2008-03-05 14:07 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

On Wed, 5 Mar 2008, Pavel Emelyanov wrote:
> Daisuke Nishimura wrote:
> > 
> > Todo:
> >   - rebase new kernel, and split into some patches.
> >   - Merge with memory subsystem (if it would be better), or
> >     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
> >     (needs to make page_cgroup more generic one).
> 
> Merge is a must IMHO. I can hardly imagine a situation in which
> someone would need these two separately.

Strongly agree.  Nobody's interested in swap as such: it's just
secondary memory, where RAM is primary memory.  People want to
control memory as the sum of the two; and I expect they may also
want to control primary memory (all that the current memcg does)
within that.  I wonder if such nesting of limits fits easily
into cgroups or will be problematic.

Hugh

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05 14:07         ` Hugh Dickins
  0 siblings, 0 replies; 50+ messages in thread
From: Hugh Dickins @ 2008-03-05 14:07 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Daisuke Nishimura, containers, linux-mm, balbir, kamezawa.hiroyu

On Wed, 5 Mar 2008, Pavel Emelyanov wrote:
> Daisuke Nishimura wrote:
> > 
> > Todo:
> >   - rebase new kernel, and split into some patches.
> >   - Merge with memory subsystem (if it would be better), or
> >     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
> >     (needs to make page_cgroup more generic one).
> 
> Merge is a must IMHO. I can hardly imagine a situation in which
> someone would need these two separately.

Strongly agree.  Nobody's interested in swap as such: it's just
secondary memory, where RAM is primary memory.  People want to
control memory as the sum of the two; and I expect they may also
want to control primary memory (all that the current memcg does)
within that.  I wonder if such nesting of limits fits easily
into cgroups or will be problematic.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <Pine.LNX.4.64.0803051400000.22243-popGQ1T0qN76K7/ahGyk6A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05 14:07         ` Hugh Dickins
@ 2008-03-05 14:14             ` Pavel Emelyanov
  -1 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-05 14:14 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: containers-qjLDD68F18O7TbgM5vRIOg,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Hugh Dickins wrote:
> On Wed, 5 Mar 2008, Pavel Emelyanov wrote:
>> Daisuke Nishimura wrote:
>>> Todo:
>>>   - rebase new kernel, and split into some patches.
>>>   - Merge with memory subsystem (if it would be better), or
>>>     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
>>>     (needs to make page_cgroup more generic one).
>> Merge is a must IMHO. I can hardly imagine a situation in which
>> someone would need these two separately.
> 
> Strongly agree.  Nobody's interested in swap as such: it's just
> secondary memory, where RAM is primary memory.  People want to
> control memory as the sum of the two; and I expect they may also
> want to control primary memory (all that the current memcg does)
> within that.  I wonder if such nesting of limits fits easily
> into cgroups or will be problematic.

This nesting would affect the res_counter abstraction, not the
cgroup infrastructure. Current design of resource counters doesn't
allow for such thing, but the extension is a couple-of-lines patch :)

> Hugh
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-05 14:14             ` Pavel Emelyanov
  0 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-05 14:14 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Daisuke Nishimura, containers, linux-mm, balbir, kamezawa.hiroyu

Hugh Dickins wrote:
> On Wed, 5 Mar 2008, Pavel Emelyanov wrote:
>> Daisuke Nishimura wrote:
>>> Todo:
>>>   - rebase new kernel, and split into some patches.
>>>   - Merge with memory subsystem (if it would be better), or
>>>     remove dependency on CONFIG_CGROUP_MEM_CONT if possible
>>>     (needs to make page_cgroup more generic one).
>> Merge is a must IMHO. I can hardly imagine a situation in which
>> someone would need these two separately.
> 
> Strongly agree.  Nobody's interested in swap as such: it's just
> secondary memory, where RAM is primary memory.  People want to
> control memory as the sum of the two; and I expect they may also
> want to control primary memory (all that the current memcg does)
> within that.  I wonder if such nesting of limits fits easily
> into cgroups or will be problematic.

This nesting would affect the res_counter abstraction, not the
cgroup infrastructure. Current design of resource counters doesn't
allow for such thing, but the extension is a couple-of-lines patch :)

> Hugh
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CEAAB4.8070208-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-05 14:14             ` Pavel Emelyanov
@ 2008-03-06  0:33                 ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-06  0:33 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On Wed, 05 Mar 2008 17:14:12 +0300
Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
> > Strongly agree.  Nobody's interested in swap as such: it's just
> > secondary memory, where RAM is primary memory.  People want to
> > control memory as the sum of the two; and I expect they may also
> > want to control primary memory (all that the current memcg does)
> > within that.  I wonder if such nesting of limits fits easily
> > into cgroups or will be problematic.
> 
> This nesting would affect the res_counter abstraction, not the
> cgroup infrastructure. Current design of resource counters doesn't
> allow for such thing, but the extension is a couple-of-lines patch :)
> 
IMHO, keeping res_counter simple is better.

Is this kind of new entry in mem_cgroup not good ?
==
struct mem_cgroup {
	...
	struct res_counter	memory_limit;
	struct res_counter	swap_limit;
	...
}

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  0:33                 ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-06  0:33 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Hugh Dickins, Daisuke Nishimura, containers, linux-mm, balbir

On Wed, 05 Mar 2008 17:14:12 +0300
Pavel Emelyanov <xemul@openvz.org> wrote:
> > Strongly agree.  Nobody's interested in swap as such: it's just
> > secondary memory, where RAM is primary memory.  People want to
> > control memory as the sum of the two; and I expect they may also
> > want to control primary memory (all that the current memcg does)
> > within that.  I wonder if such nesting of limits fits easily
> > into cgroups or will be problematic.
> 
> This nesting would affect the res_counter abstraction, not the
> cgroup infrastructure. Current design of resource counters doesn't
> allow for such thing, but the extension is a couple-of-lines patch :)
> 
IMHO, keeping res_counter simple is better.

Is this kind of new entry in mem_cgroup not good ?
==
struct mem_cgroup {
	...
	struct res_counter	memory_limit;
	struct res_counter	swap_limit;
	...
}

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <20080306093324.77c6d7f4.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  0:33                 ` KAMEZAWA Hiroyuki
@ 2008-03-06  0:35                     ` Paul Menage
  -1 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-06  0:35 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins, Pavel Emelyanov,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

On Wed, Mar 5, 2008 at 4:33 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org> wrote:
>  Is this kind of new entry in mem_cgroup not good ?
>  ==
>  struct mem_cgroup {
>         ...
>         struct res_counter      memory_limit.
>         struct res_counter      swap_limit.
>         ..

I agree with this - main memory and swap memory are rather different
kinds of resources, with very different performance characteristics.
It should be possible to control them completely independently (e.g.
this job gets 100M of main memory, and doesn't swap at all).

Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  0:35                     ` Paul Menage
  0 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-06  0:35 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Pavel Emelyanov, Hugh Dickins, Daisuke Nishimura, containers,
	linux-mm, balbir

On Wed, Mar 5, 2008 at 4:33 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
>  Is this kind of new entry in mem_cgroup not good ?
>  ==
>  struct mem_cgroup {
>         ...
>         struct res_counter      memory_limit.
>         struct res_counter      swap_limit.
>         ..

I agree with this - main memory and swap memory are rather different
kinds of resources, with very different performance characteristics.
It should be possible to control them completely independently (e.g.
this job gets 100M of main memory, and doesn't swap at all).

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  0:33                 ` KAMEZAWA Hiroyuki
@ 2008-03-06  8:20                     ` Pavel Emelyanov
  -1 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06  8:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

KAMEZAWA Hiroyuki wrote:
> On Wed, 05 Mar 2008 17:14:12 +0300
> Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
>>> Strongly agree.  Nobody's interested in swap as such: it's just
>>> secondary memory, where RAM is primary memory.  People want to
>>> control memory as the sum of the two; and I expect they may also
>>> want to control primary memory (all that the current memcg does)
>>> within that.  I wonder if such nesting of limits fits easily
>>> into cgroups or will be problematic.
>> This nesting would affect the res_counter abstraction, not the
>> cgroup infrastructure. Current design of resource counters doesn't
>> allow for such thing, but the extension is a couple-of-lines patch :)
>>
> IMHO, keeping res_counter simple is better.
> 
> Is this kind of new entry in mem_cgroup not good ?
> ==
> struct mem_cgroup {
> 	...
> 	struct res_counter	memory_limit.
> 	struct res_counter	swap_limit.
> 	..
> }

I meant the same thing actually. By "nesting would affect" I
meant that we might want to make res_counters hierarchical.

That would kill two birds with one stone - we would get true
hierarchical memory accounting and could charge two counters
with one call.

> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  8:20                     ` Pavel Emelyanov
  0 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06  8:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Hugh Dickins, Daisuke Nishimura, containers, linux-mm, balbir

KAMEZAWA Hiroyuki wrote:
> On Wed, 05 Mar 2008 17:14:12 +0300
> Pavel Emelyanov <xemul@openvz.org> wrote:
>>> Strongly agree.  Nobody's interested in swap as such: it's just
>>> secondary memory, where RAM is primary memory.  People want to
>>> control memory as the sum of the two; and I expect they may also
>>> want to control primary memory (all that the current memcg does)
>>> within that.  I wonder if such nesting of limits fits easily
>>> into cgroups or will be problematic.
>> This nesting would affect the res_counter abstraction, not the
>> cgroup infrastructure. Current design of resource counters doesn't
>> allow for such thing, but the extension is a couple-of-lines patch :)
>>
> IMHO, keeping res_counter simple is better.
> 
> Is this kind of new entry in mem_cgroup not good ?
> ==
> struct mem_cgroup {
> 	...
> 	struct res_counter	memory_limit.
> 	struct res_counter	swap_limit.
> 	..
> }

I meant the same thing actually. By "nesting would affect" I
meant that we might want to make res_counters hierarchical.

That would kill two birds with one stone - we would get true
hierarchical memory accounting and could charge two counters
with one call.

> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CFA941.4070507-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  8:20                     ` Pavel Emelyanov
@ 2008-03-06  8:33                         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-06  8:33 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On Thu, 06 Mar 2008 11:20:17 +0300
Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Wed, 05 Mar 2008 17:14:12 +0300
> > Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
> >>> Strongly agree.  Nobody's interested in swap as such: it's just
> >>> secondary memory, where RAM is primary memory.  People want to
> >>> control memory as the sum of the two; and I expect they may also
> >>> want to control primary memory (all that the current memcg does)
> >>> within that.  I wonder if such nesting of limits fits easily
> >>> into cgroups or will be problematic.
> >> This nesting would affect the res_counter abstraction, not the
> >> cgroup infrastructure. Current design of resource counters doesn't
> >> allow for such thing, but the extension is a couple-of-lines patch :)
> >>
> > IMHO, keeping res_counter simple is better.
> > 
> > Is this kind of new entry in mem_cgroup not good ?
> > ==
> > struct mem_cgroup {
> > 	...
> > 	struct res_counter	memory_limit.
> > 	struct res_counter	swap_limit.
> > 	..
> > }
> 
> I meant the same thing actually. By "nesting would affect" I
> meant, that we might want to make res_counters hierarchical.
> 
> That would kill two birds with one stone - we will make a true
> hierarchical memory accounting and let charging of two counters
> with one call.

Hierarchical res_counter makes sense.
Implementing it in a simple/reasonable style will be our challenge.

Thanks,
-Kame

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  8:33                         ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 50+ messages in thread
From: KAMEZAWA Hiroyuki @ 2008-03-06  8:33 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Hugh Dickins, Daisuke Nishimura, containers, linux-mm, balbir

On Thu, 06 Mar 2008 11:20:17 +0300
Pavel Emelyanov <xemul@openvz.org> wrote:

> KAMEZAWA Hiroyuki wrote:
> > On Wed, 05 Mar 2008 17:14:12 +0300
> > Pavel Emelyanov <xemul@openvz.org> wrote:
> >>> Strongly agree.  Nobody's interested in swap as such: it's just
> >>> secondary memory, where RAM is primary memory.  People want to
> >>> control memory as the sum of the two; and I expect they may also
> >>> want to control primary memory (all that the current memcg does)
> >>> within that.  I wonder if such nesting of limits fits easily
> >>> into cgroups or will be problematic.
> > >> This nesting would affect the res_counter abstraction, not the
> >> cgroup infrastructure. Current design of resource counters doesn't
> >> allow for such thing, but the extension is a couple-of-lines patch :)
> >>
> > IMHO, keeping res_counter simple is better.
> > 
> > Is this kind of new entry in mem_cgroup not good ?
> > ==
> > struct mem_cgroup {
> > 	...
> > > 	struct res_counter	memory_limit;
> > > 	struct res_counter	swap_limit;
> > 	..
> > }
> 
> I meant the same thing actually. By "nesting would affect" I
> meant, that we might want to make res_counters hierarchical.
> 
> That would kill two birds with one stone - we will make a true
> hierarchical memory accounting and let charging of two counters
> with one call.

Hierarchical res_counter makes sense.
Making it in simple/reasonable style will be our challenge. 

Thanks,
-Kame

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <20080306173347.f6c5c84c.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>]

* Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  8:33                         ` KAMEZAWA Hiroyuki
@ 2008-03-06  8:38                             ` Pavel Emelyanov
  -1 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06  8:38 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

KAMEZAWA Hiroyuki wrote:
> On Thu, 06 Mar 2008 11:20:17 +0300
> Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Wed, 05 Mar 2008 17:14:12 +0300
>>> Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
>>>>> Strongly agree.  Nobody's interested in swap as such: it's just
>>>>> secondary memory, where RAM is primary memory.  People want to
>>>>> control memory as the sum of the two; and I expect they may also
>>>>> want to control primary memory (all that the current memcg does)
>>>>> within that.  I wonder if such nesting of limits fits easily
>>>>> into cgroups or will be problematic.
>>>> This nesting would affect the res_counter abstraction, not the
>>>> cgroup infrastructure. Current design of resource counters doesn't
>>>> allow for such thing, but the extension is a couple-of-lines patch :)
>>>>
>>> IMHO, keeping res_counter simple is better.
>>>
>>> Is this kind of new entry in mem_cgroup not good ?
>>> ==
>>> struct mem_cgroup {
>>> 	...
>>> 	struct res_counter	memory_limit;
>>> 	struct res_counter	swap_limit;
>>> 	..
>>> }
>> I meant the same thing actually. By "nesting would affect" I
>> meant, that we might want to make res_counters hierarchical.
>>
>> That would kill two birds with one stone - we will make a true
>> hierarchical memory accounting and let charging of two counters
>> with one call.
> 
> Hierarchical res_counter makes sense.
> Making it in simple/reasonable style will be our challenge. 

I have this in my TODO list. Since this is not so urgent, then if you
don't mind I can prepare the patches next week - after I set the git 
tree up. This change doesn't seem that big.

> Thanks,
> -Kame
> 
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  8:38                             ` Pavel Emelyanov
  0 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06  8:38 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki, balbir
  Cc: Hugh Dickins, Daisuke Nishimura, containers, linux-mm

KAMEZAWA Hiroyuki wrote:
> On Thu, 06 Mar 2008 11:20:17 +0300
> Pavel Emelyanov <xemul@openvz.org> wrote:
> 
>> KAMEZAWA Hiroyuki wrote:
>>> On Wed, 05 Mar 2008 17:14:12 +0300
>>> Pavel Emelyanov <xemul@openvz.org> wrote:
>>>>> Strongly agree.  Nobody's interested in swap as such: it's just
>>>>> secondary memory, where RAM is primary memory.  People want to
>>>>> control memory as the sum of the two; and I expect they may also
>>>>> want to control primary memory (all that the current memcg does)
>>>>> within that.  I wonder if such nesting of limits fits easily
>>>>> into cgroups or will be problematic.
>>>> This nesting would affect the res_counter abstraction, not the
>>>> cgroup infrastructure. Current design of resource counters doesn't
>>>> allow for such thing, but the extension is a couple-of-lines patch :)
>>>>
>>> IMHO, keeping res_counter simple is better.
>>>
>>> Is this kind of new entry in mem_cgroup not good ?
>>> ==
>>> struct mem_cgroup {
>>> 	...
>>> 	struct res_counter	memory_limit;
>>> 	struct res_counter	swap_limit;
>>> 	..
>>> }
>> I meant the same thing actually. By "nesting would affect" I
>> meant, that we might want to make res_counters hierarchical.
>>
>> That would kill two birds with one stone - we will make a true
>> hierarchical memory accounting and let charging of two counters
>> with one call.
> 
> Hierarchical res_counter makes sense.
> Making it in simple/reasonable style will be our challenge. 

I have this in my TODO list. Since this is not so urgent, then if you
don't mind I can prepare the patches next week - after I set the git 
tree up. This change doesn't seem that big.

> Thanks,
> -Kame
> 
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CFAD69.6000909-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>]

* Re: [Devel] Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  8:38                             ` Pavel Emelyanov
@ 2008-03-06  8:48                                 ` Paul Menage
  -1 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-06  8:48 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

On Thu, Mar 6, 2008 at 12:38 AM, Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
>  > Hierarchical res_counter makes sense.
>  > Making it in simple/reasonable style will be our challenge.
>
>  I have this in my TODO list. Since this is not so urgent, then if you
>  don't mind I can prepare the patches next week - after I set the git
>  tree up. This change doesn't seem that big.
>

The change that you're referring to is allowing a cgroup to have a
total memory limit for itself and all its children, and then giving
that cgroup's children separate memory limits within that overall
limit?

Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Devel] Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  8:48                                 ` Paul Menage
  0 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-06  8:48 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: KAMEZAWA Hiroyuki, balbir, containers, Hugh Dickins, linux-mm

On Thu, Mar 6, 2008 at 12:38 AM, Pavel Emelyanov <xemul@openvz.org> wrote:
>  > Hierarchical res_counter makes sense.
>  > Making it in simple/reasonable style will be our challenge.
>
>  I have this in my TODO list. Since this is not so urgent, then if you
>  don't mind I can prepare the patches next week - after I set the git
>  tree up. This change doesn't seem that big.
>

The change that you're referring to is allowing a cgroup to have a
total memory limit for itself and all its children, and then giving
that cgroup's children separate memory limits within that overall
limit?

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <6599ad830803060048sb39735an765a62e6b928657e-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]

* Re: [Devel] Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  8:48                                 ` Paul Menage
@ 2008-03-06  8:50                                     ` Pavel Emelyanov
  -1 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06  8:50 UTC (permalink / raw)
  To: Paul Menage
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Paul Menage wrote:
> On Thu, Mar 6, 2008 at 12:38 AM, Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
>>  > Hierarchical res_counter makes sense.
>>  > Making it in simple/reasonable style will be our challenge.
>>
>>  I have this in my TODO list. Since this is not so urgent, then if you
>>  don't mind I can prepare the patches next week - after I set the git
>>  tree up. This change doesn't seem that big.
>>
> 
> The change that you're referring to is allowing a cgroup to have a
> total memory limit for itself and all its children, and then giving
> that cgroup's children separate memory limits within that overall
> limit?

Yup. Isn't this reasonable? 

Without this, if I'm a task in a 1GB limited cgroup, I can create a new 
one, set 2GB limit and spawn a kid into it (or move there myself) and be 
happy with 2GB of memory... With the proposed change, even if I set a 2GB
limit for a subgroup, it will not exceed _my_ (1GB) limit.

> Paul
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Devel] Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  8:50                                     ` Pavel Emelyanov
  0 siblings, 0 replies; 50+ messages in thread
From: Pavel Emelyanov @ 2008-03-06  8:50 UTC (permalink / raw)
  To: Paul Menage; +Cc: KAMEZAWA Hiroyuki, balbir, containers, Hugh Dickins, linux-mm

Paul Menage wrote:
> On Thu, Mar 6, 2008 at 12:38 AM, Pavel Emelyanov <xemul@openvz.org> wrote:
>>  > Hierarchical res_counter makes sense.
>>  > Making it in simple/reasonable style will be our challenge.
>>
>>  I have this in my TODO list. Since this is not so urgent, then if you
>>  don't mind I can prepare the patches next week - after I set the git
>>  tree up. This change doesn't seem that big.
>>
> 
> The change that you're referring to is allowing a cgroup to have a
> total memory limit for itself and all its children, and then giving
> that cgroup's children separate memory limits within that overall
> limit?

Yup. Isn't this reasonable? 

Without this, if I'm a task in a 1GB limited cgroup, I can create a new 
one, set 2GB limit and spawn a kid into it (or move there myself) and be 
happy with 2GB of memory... With the proposed change, even if I set a 2GB
limit for a subgroup, it will not exceed _my_ (1GB) limit.

> Paul
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

[parent not found: <47CFB065.3080200-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>]

* Re: [Devel] Re: [RFC/PATCH] cgroup swap subsystem
  2008-03-06  8:50                                     ` Pavel Emelyanov
@ 2008-03-06  8:52                                         ` Paul Menage
  -1 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-06  8:52 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, Hugh Dickins,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

On Thu, Mar 6, 2008 at 12:50 AM, Pavel Emelyanov <xemul-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org> wrote:
>  > The change that you're referring to is allowing a cgroup to have a
>  > total memory limit for itself and all its children, and then giving
>  > that cgroup's children separate memory limits within that overall
>  > limit?
>
>  Yup. Isn't this reasonable?

Yes, sounds like a good plan.

Paul

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [Devel] Re: [RFC/PATCH] cgroup swap subsystem
@ 2008-03-06  8:52                                         ` Paul Menage
  0 siblings, 0 replies; 50+ messages in thread
From: Paul Menage @ 2008-03-06  8:52 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: KAMEZAWA Hiroyuki, balbir, containers, Hugh Dickins, linux-mm

On Thu, Mar 6, 2008 at 12:50 AM, Pavel Emelyanov <xemul@openvz.org> wrote:
>  > The change that you're referring to is allowing a cgroup to have a
>  > total memory limit for itself and all its children, and then giving
>  > that cgroup's children separate memory limits within that overall
>  > limit?
>
>  Yup. Isn't this reasonable?

Yes, sounds like a good plan.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2008-03-12 22:57 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-05  5:59 [RFC/PATCH] cgroup swap subsystem Daisuke Nishimura
2008-03-05  5:59 ` Daisuke Nishimura
     [not found] ` <47CE36A9.3060204-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>
2008-03-05  6:36   ` Paul Menage
2008-03-05  6:36     ` Paul Menage
     [not found]     ` <6599ad830803042236x3e5fdf0dmaf4119997025ba40-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2008-03-06 12:20       ` Daisuke Nishimura
2008-03-06 12:20         ` Daisuke Nishimura
2008-03-05  6:53   ` KAMEZAWA Hiroyuki
2008-03-05  6:53     ` KAMEZAWA Hiroyuki
     [not found]     ` <20080305155329.60e02f48.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2008-03-05 21:51       ` Hirokazu Takahashi
2008-03-05 21:51         ` Hirokazu Takahashi
2008-03-06 11:45       ` Daisuke Nishimura
2008-03-06 11:45         ` Daisuke Nishimura
     [not found]         ` <47CFD957.3060402-YQH0OdQVrdy45+QrQBaojngSJqDPrsil@public.gmane.org>
2008-03-06 12:25           ` Pavel Emelyanov
2008-03-06 12:25             ` Pavel Emelyanov
2008-03-06 12:56           ` kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A
2008-03-06 12:56             ` kamezawa.hiroyu
     [not found]             ` <6197904.1204808216900.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2008-03-07  8:22               ` Daisuke Nishimura
2008-03-07  8:22                 ` Daisuke Nishimura
2008-03-12 22:57               ` YAMAMOTO Takashi
2008-03-12 22:57                 ` YAMAMOTO Takashi
2008-03-05  7:03   ` KAMEZAWA Hiroyuki
2008-03-05  7:03     ` KAMEZAWA Hiroyuki
2008-03-05  7:28   ` Balbir Singh
2008-03-05  7:28     ` Balbir Singh
     [not found]     ` <47CE4BB6.8050803-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2008-03-07  4:23       ` Daisuke Nishimura
2008-03-07  4:23         ` Daisuke Nishimura
2008-03-05  8:33   ` Pavel Emelyanov
2008-03-05  8:33     ` Pavel Emelyanov
     [not found]     ` <47CE5AE2.2050303-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-03-05  8:51       ` Daisuke Nishimura
2008-03-05  8:51         ` Daisuke Nishimura
2008-03-05 14:07       ` Hugh Dickins
2008-03-05 14:07         ` Hugh Dickins
     [not found]         ` <Pine.LNX.4.64.0803051400000.22243-popGQ1T0qN76K7/ahGyk6A@public.gmane.org>
2008-03-05 14:14           ` Pavel Emelyanov
2008-03-05 14:14             ` Pavel Emelyanov
     [not found]             ` <47CEAAB4.8070208-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-03-06  0:33               ` KAMEZAWA Hiroyuki
2008-03-06  0:33                 ` KAMEZAWA Hiroyuki
     [not found]                 ` <20080306093324.77c6d7f4.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2008-03-06  0:35                   ` Paul Menage
2008-03-06  0:35                     ` Paul Menage
2008-03-06  8:20                   ` Pavel Emelyanov
2008-03-06  8:20                     ` Pavel Emelyanov
     [not found]                     ` <47CFA941.4070507-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-03-06  8:33                       ` KAMEZAWA Hiroyuki
2008-03-06  8:33                         ` KAMEZAWA Hiroyuki
     [not found]                         ` <20080306173347.f6c5c84c.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2008-03-06  8:38                           ` Pavel Emelyanov
2008-03-06  8:38                             ` Pavel Emelyanov
     [not found]                             ` <47CFAD69.6000909-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-03-06  8:48                               ` [Devel] " Paul Menage
2008-03-06  8:48                                 ` Paul Menage
     [not found]                                 ` <6599ad830803060048sb39735an765a62e6b928657e-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2008-03-06  8:50                                   ` Pavel Emelyanov
2008-03-06  8:50                                     ` Pavel Emelyanov
     [not found]                                     ` <47CFB065.3080200-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
2008-03-06  8:52                                       ` Paul Menage
2008-03-06  8:52                                         ` Paul Menage

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.