From: Li Zefan <lizf@cn.fujitsu.com>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
xemul@openvz.org, "hugh@veritas.com" <hugh@veritas.com>
Subject: Re: [PATCH 5/7] radix-tree page cgroup
Date: Mon, 17 Mar 2008 11:56:06 +0900 [thread overview]
Message-ID: <47DDDDC6.2080808@cn.fujitsu.com> (raw)
In-Reply-To: <20080314191733.eff648f8.kamezawa.hiroyu@jp.fujitsu.com>
KAMEZAWA Hiroyuki wrote:
> A lookup routine for page_cgroup struct.
>
> Now, page_cgroup is pointed by struct page's page_cgroup entry
>
> struct page {
> ...
> struct page_cgroup *page_cgroup;
> ..
> }
>
> But some people dislike this because this increases sizeof(struct page).
>
> For avoiding that, we'll have to add a lookup routine for
> pfn <-> page_cgroup.
> by radix-tree.
>
> New function is
>
> struct page *get_page_cgroup(struct page *page, gfp_mask mask, bool allocate);
>
> if (allocate == true), look up and allocate new one if necessary.
> if (allocate == false), just do look up and return NULL if not exist.
>
> Changes:
> - add the 3rd argument 'allocate'
> - making page_cgroup chunk size to be configurable (for test.)
>
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> init/Kconfig | 14 ++++
> mm/Makefile | 2
> mm/page_cgroup.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 184 insertions(+), 1 deletion(-)
>
> Index: mm-2.6.25-rc5-mm1/mm/page_cgroup.c
> ===================================================================
> --- /dev/null
> +++ mm-2.6.25-rc5-mm1/mm/page_cgroup.c
> @@ -0,0 +1,173 @@
> +/*
> + * page_cgroup mamagement codes.
> + * page_cgroup is yet another mem_map when cgroup's memory resoruce controller
> + * is activated. It containes information which cannot be stored in usual
> + * mem_map. (it's too big.)
> + * This allows us to keep 'struct page' small when a user doesn't activate
> + * memory resource controller.
> + *
> + * Note: all things are allocated on demand.
> + *
> + * We can translate : struct page <-> pfn -> page_cgroup -> struct page.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <linux/radix-tree.h>
> +#include <linux/memcontrol.h>
> +#include <linux/page_cgroup.h>
> +#include <linux/err.h>
> +
> +
> +
> +#define PCGRP_SHIFT (CONFIG_CGROUP_PAGE_CGROUP_ORDER)
> +#define PCGRP_SIZE (1 << PCGRP_SHIFT)
> +
> +struct page_cgroup_head {
> + struct page_cgroup pc[PCGRP_SIZE];
> +};
> +
> +struct page_cgroup_root {
> + spinlock_t tree_lock;
> + struct radix_tree_root root_node;
> +};
> +
> +static struct page_cgroup_root *root_dir[MAX_NUMNODES];
> +
> +static void init_page_cgroup(struct page_cgroup_head *head, unsigned long pfn)
> +{
> + int i;
> + struct page_cgroup *pc;
> +
> + memset(head, 0, sizeof(*head));
> + for (i = 0; i < PCGRP_SIZE; ++i) {
Usually we use 'i++' in this case; gcc will take care of it.
> + pc = &head->pc[i];
> + pc->page = pfn_to_page(pfn + i);
> + spin_lock_init(&pc->lock);
> + INIT_LIST_HEAD(&pc->lru);
> + }
> +}
> +
> +
> +struct kmem_cache *page_cgroup_cachep;
> +
> +static struct page_cgroup_head *
> +alloc_init_page_cgroup(unsigned long pfn, int nid, gfp_t mask)
> +{
> + struct page_cgroup_head *head;
> +
> + head = kmem_cache_alloc_node(page_cgroup_cachep, mask, nid);
> + if (!head)
> + return NULL;
> +
> + init_page_cgroup(head, pfn);
> +
> + return head;
> +}
> +
> +void free_page_cgroup(struct page_cgroup_head *head)
> +{
> + kmem_cache_free(page_cgroup_cachep, head);
> +}
> +
> +
> +/*
> + * Look up page_cgroup struct for struct page (page's pfn)
> + * if (allocate == true), look up and allocate new one if necessary.
> + * if (allocate == false), look up and return NULL if it cannot be found.
> + */
> +
It's unclear when NULL is returned and when an error pointer (-EXXX) is returned.
If (allocate == true), can -EXXX still be returned?
> +struct page_cgroup *
> +get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
> +{
> + struct page_cgroup_root *root;
> + struct page_cgroup_head *head;
> + struct page_cgroup *pc;
> + unsigned long pfn, idx;
> + int nid;
> + unsigned long base_pfn, flags;
> + int error;
> +
> + if (!page)
> + return NULL;
> +
> + pfn = page_to_pfn(page);
> + idx = pfn >> PCGRP_SHIFT;
> + nid = page_to_nid(page);
> +
> + root = root_dir[nid];
> + /* Before Init ? */
> + if (unlikely(!root))
> + return NULL;
> +
> + base_pfn = idx << PCGRP_SHIFT;
> +retry:
> + error = 0;
> + rcu_read_lock();
> + head = radix_tree_lookup(&root->root_node, idx);
> + rcu_read_unlock();
> +
> + if (likely(head))
> + return &head->pc[pfn - base_pfn];
> + if (allocate == false)
> + return NULL;
> +
> + /* Very Slow Path. On demand allocation. */
> + gfpmask = gfpmask & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
> +
> + head = alloc_init_page_cgroup(base_pfn, nid, gfpmask);
> + if (!head)
> + return ERR_PTR(-ENOMEM);
> + pc = NULL;
> + error = radix_tree_preload(gfpmask);
> + if (error)
> + goto out;
> + spin_lock_irqsave(&root->tree_lock, flags);
> + error = radix_tree_insert(&root->root_node, idx, head);
> +
> + if (!error)
> + pc = &head->pc[pfn - base_pfn];
> + spin_unlock_irqrestore(&root->tree_lock, flags);
> + radix_tree_preload_end();
> +out:
> + if (!pc) {
> + free_page_cgroup(head);
> + if (error == -EEXIST)
> + goto retry;
> + }
> + if (error)
> + pc = ERR_PTR(error);
> + return pc;
> +}
> +
> +__init int page_cgroup_init(void)
> +{
> + int nid;
> + struct page_cgroup_root *root;
> +
> + page_cgroup_cachep = kmem_cache_create("page_cgroup",
> + sizeof(struct page_cgroup_head), 0,
> + SLAB_PANIC | SLAB_DESTROY_BY_RCU, NULL);
> + if (!page_cgroup_cachep) {
> + printk(KERN_ERR "page accouning setup failure\n");
> + printk(KERN_ERR "can't initialize slab memory\n");
> + /* FIX ME: should return some error code ? */
> + return 0;
Why can't this return -ENOMEM?
> + }
> + for_each_online_node(nid) {
> + if (node_state(nid, N_NORMAL_MEMORY)
> + root = kmalloc_node(sizeof(struct page_cgroup_root),
> + GFP_KERNEL, nid);
The allocation result is not checked here; a test such as "if (root == NULL)" is needed.
> + else
> + root = kmalloc(sizeof(struct page_cgroup_root),
> + GFP_KERNEL);
ditto
> + INIT_RADIX_TREE(&root->root_node, GFP_ATOMIC);
> + spin_lock_init(&root->tree_lock);
> + smp_wmb();
> + root_dir[nid] = root;
> + }
> +
> + printk(KERN_INFO "Page Accouintg is activated\n");
> + return 0;
> +}
> +late_initcall(page_cgroup_init);
> Index: mm-2.6.25-rc5-mm1/mm/Makefile
> ===================================================================
> --- mm-2.6.25-rc5-mm1.orig/mm/Makefile
> +++ mm-2.6.25-rc5-mm1/mm/Makefile
> @@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
> obj-$(CONFIG_MIGRATION) += migrate.o
> obj-$(CONFIG_SMP) += allocpercpu.o
> obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>
> Index: mm-2.6.25-rc5-mm1/init/Kconfig
> ===================================================================
> --- mm-2.6.25-rc5-mm1.orig/init/Kconfig
> +++ mm-2.6.25-rc5-mm1/init/Kconfig
> @@ -405,6 +405,20 @@ config SYSFS_DEPRECATED_V2
> If you are using a distro with the most recent userspace
> packages, it should be safe to say N here.
>
> +config CGROUP_PAGE_CGROUP_ORDER
> + int "Order of page accounting subsystem"
> + range 0 10
> + default 3 if HIGHMEM64G
> + default 10 if 64BIT
> + default 7
> + depends on CGROUP_MEM_RES_CTLR
> + help
> + By making this value to be small, wastes in memory usage of page
> + accounting can be small. But big number is good for perfomance.
s/perfomance/performance
> + Especially, HIGHMEM64G users should keep this to be small because
> + you tend to have small kernel memory.
> + If unsure, use default.
> +
> config PROC_PID_CPUSET
> bool "Include legacy /proc/<pid>/cpuset file"
> depends on CPUSETS
>
> --
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2008-03-17 2:56 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-03-14 9:59 [PATCH 0/7] memcg: radix-tree page_cgroup KAMEZAWA Hiroyuki
2008-03-14 10:03 ` [PATCH 1/7] re-define page_cgroup KAMEZAWA Hiroyuki
2008-03-16 14:15 ` Balbir Singh
2008-03-18 1:10 ` KAMEZAWA Hiroyuki
2008-03-17 0:21 ` Li Zefan
2008-03-18 1:12 ` KAMEZAWA Hiroyuki
2008-03-17 2:07 ` Li Zefan
2008-03-18 1:11 ` KAMEZAWA Hiroyuki
2008-03-14 10:06 ` [PATCH 2/7] charge/uncharge KAMEZAWA Hiroyuki
2008-03-17 1:46 ` Balbir Singh
2008-03-18 1:14 ` KAMEZAWA Hiroyuki
2008-03-17 2:26 ` Li Zefan
2008-03-18 1:15 ` KAMEZAWA Hiroyuki
2008-03-14 10:07 ` [PATCH 3/7] memcg: move_lists KAMEZAWA Hiroyuki
2008-03-18 16:44 ` Balbir Singh
2008-03-19 2:34 ` KAMEZAWA Hiroyuki
2008-03-14 10:15 ` [PATCH 4/7] memcg: page migration KAMEZAWA Hiroyuki
2008-03-17 2:36 ` Li Zefan
2008-03-18 1:17 ` KAMEZAWA Hiroyuki
2008-03-18 18:11 ` Balbir Singh
2008-03-19 2:44 ` KAMEZAWA Hiroyuki
2008-03-14 10:17 ` [PATCH 5/7] radix-tree page cgroup KAMEZAWA Hiroyuki
2008-03-17 2:56 ` Li Zefan [this message]
2008-03-17 3:26 ` Li Zefan
2008-03-18 1:18 ` KAMEZAWA Hiroyuki
2008-03-18 1:23 ` KAMEZAWA Hiroyuki
2008-03-19 2:05 ` Balbir Singh
2008-03-19 2:51 ` KAMEZAWA Hiroyuki
2008-03-19 3:14 ` Balbir Singh
2008-03-19 3:24 ` KAMEZAWA Hiroyuki
2008-03-19 21:11 ` Peter Zijlstra
2008-03-20 4:45 ` KAMEZAWA Hiroyuki
2008-03-20 5:09 ` KAMEZAWA Hiroyuki
2008-03-14 10:18 ` [PATCH 6/7] memcg: speed up by percpu KAMEZAWA Hiroyuki
2008-03-17 3:03 ` Li Zefan
2008-03-18 1:25 ` KAMEZAWA Hiroyuki
2008-03-18 23:55 ` Li Zefan
2008-03-19 2:51 ` KAMEZAWA Hiroyuki
2008-03-19 21:19 ` Peter Zijlstra
2008-03-19 21:41 ` Peter Zijlstra
2008-03-20 9:08 ` Andy Whitcroft
2008-03-20 4:46 ` KAMEZAWA Hiroyuki
2008-03-14 10:22 ` [PATCH 7/7] memcg: freeing page_cgroup at suitable chance KAMEZAWA Hiroyuki
2008-03-17 3:10 ` Li Zefan
2008-03-18 1:30 ` KAMEZAWA Hiroyuki
2008-03-19 21:33 ` Peter Zijlstra
2008-03-20 5:07 ` KAMEZAWA Hiroyuki
2008-03-20 7:55 ` Peter Zijlstra
2008-03-20 14:49 ` kamezawa.hiroyu
2008-03-20 16:04 ` kamezawa.hiroyu
2008-03-20 16:09 ` Peter Zijlstra
2008-03-20 16:15 ` kamezawa.hiroyu
2008-03-15 6:15 ` [PATCH 0/7] memcg: radix-tree page_cgroup Balbir Singh
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=47DDDDC6.2080808@cn.fujitsu.com \
--to=lizf@cn.fujitsu.com \
--cc=balbir@linux.vnet.ibm.com \
--cc=hugh@veritas.com \
--cc=kamezawa.hiroyu@jp.fujitsu.com \
--cc=linux-mm@kvack.org \
--cc=xemul@openvz.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.