Re: [PATCH 5/7] radix-tree page cgroup

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "linux-mm@kvack.org" <linux-mm@kvack.org>,
	"balbir@linux.vnet.ibm.com" <balbir@linux.vnet.ibm.com>,
	xemul@openvz.org, "hugh@veritas.com" <hugh@veritas.com>
Subject: Re: [PATCH 5/7] radix-tree page cgroup
Date: Wed, 19 Mar 2008 22:11:06 +0100	[thread overview]
Message-ID: <1205961066.6437.10.camel@lappy> (raw)
In-Reply-To: <20080314191733.eff648f8.kamezawa.hiroyu@jp.fujitsu.com>

On Fri, 2008-03-14 at 19:17 +0900, KAMEZAWA Hiroyuki wrote:
> A lookup routine for page_cgroup struct.
> 
> Now, page_cgroup is pointed by struct page's page_cgroup entry
> 
> struct page {
> 	...
> 	struct page_cgroup *page_cgroup;
> 	..
> }
> 
> But some people dislike this because this increases sizeof(struct page).
> 
> For avoiding that, we'll have to add a lookup routine for
> 	pfn <-> page_cgroup.
> by radix-tree.

Good stuff.

> New function is
> 
> struct page *get_page_cgroup(struct page *page, gfp_mask mask, bool allocate);
> 
> if (allocate == true), look up and allocate new one if necessary.
> if (allocate == false), just do look up and return NULL if not exist.

I think others said as well, but we generally just write

 if (allocate)

 if (!allocate)


> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> 
>  init/Kconfig     |   14 ++++
>  mm/Makefile      |    2 
>  mm/page_cgroup.c |  169 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 184 insertions(+), 1 deletion(-)
> 
> Index: mm-2.6.25-rc5-mm1/mm/page_cgroup.c
> ===================================================================
> --- /dev/null
> +++ mm-2.6.25-rc5-mm1/mm/page_cgroup.c
> @@ -0,0 +1,173 @@
> +/*
> + * page_cgroup mamagement codes.
> + * page_cgroup is yet another mem_map when cgroup's memory resoruce controller
> + * is activated. It containes information which cannot be stored in usual
> + * mem_map. (it's too big.)
> + * This allows us to keep 'struct page' small when a user doesn't activate
> + * memory resource controller.
> + *
> + * Note: all things are allocated on demand.
> + *
> + * We can translate : struct page <-> pfn -> page_cgroup -> struct page.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/slab.h>
> +#include <linux/radix-tree.h>
> +#include <linux/memcontrol.h>
> +#include <linux/page_cgroup.h>
> +#include <linux/err.h>
> +
> +
> +
> +#define PCGRP_SHIFT	(CONFIG_CGROUP_PAGE_CGROUP_ORDER)
> +#define PCGRP_SIZE	(1 << PCGRP_SHIFT)
> +
> +struct page_cgroup_head {
> +	struct page_cgroup pc[PCGRP_SIZE];
> +};
> +
> +struct page_cgroup_root {
> +	spinlock_t	       tree_lock;
> +	struct radix_tree_root root_node;
> +};
> +
> +static struct page_cgroup_root *root_dir[MAX_NUMNODES];
> +
> +static void init_page_cgroup(struct page_cgroup_head *head, unsigned long pfn)
> +{
> +	int i;
> +	struct page_cgroup *pc;
> +
> +	memset(head, 0, sizeof(*head));
> +	for (i = 0; i < PCGRP_SIZE; ++i) {
> +		pc = &head->pc[i];
> +		pc->page = pfn_to_page(pfn + i);
> +		spin_lock_init(&pc->lock);
> +		INIT_LIST_HEAD(&pc->lru);
> +	}
> +}
> +
> +
> +struct kmem_cache *page_cgroup_cachep;
> +
> +static struct page_cgroup_head *
> +alloc_init_page_cgroup(unsigned long pfn, int nid, gfp_t mask)
> +{
> +	struct page_cgroup_head *head;
> +
> +	head = kmem_cache_alloc_node(page_cgroup_cachep, mask, nid);
> +	if (!head)
> +		return NULL;
> +
> +	init_page_cgroup(head, pfn);

Just because I'm lazy, I'll suggest the shorter:

if (head)
   init_page_cgroup(head, pfn)

> +
> +	return head;
> +}
> +
> +void free_page_cgroup(struct page_cgroup_head *head)
> +{
> +	kmem_cache_free(page_cgroup_cachep, head);
> +}
> +
> +
> +/*
> + * Look up page_cgroup struct for struct page (page's pfn)
> + * if (allocate == true), look up and allocate new one if necessary.
> + * if (allocate == false), look up and return NULL if it cannot be found.
> + */
> +
> +struct page_cgroup *
> +get_page_cgroup(struct page *page, gfp_t gfpmask, bool allocate)
> +{
> +	struct page_cgroup_root *root;
> +	struct page_cgroup_head *head;
> +	struct page_cgroup *pc;
> +	unsigned long pfn, idx;
> +	int nid;
> +	unsigned long base_pfn, flags;
> +	int error;

Would a this make sense? :

  might_sleep_if(allocate && (gfp_mask & __GFP_WAIT));

> +	if (!page)
> +		return NULL;
> +
> +	pfn = page_to_pfn(page);
> +	idx = pfn >> PCGRP_SHIFT;
> +	nid = page_to_nid(page);
> +
> +	root = root_dir[nid];
> +	/* Before Init ? */
> +	if (unlikely(!root))
> +		return NULL;
> +
> +	base_pfn = idx << PCGRP_SHIFT;
> +retry:
> +	error = 0;
> +	rcu_read_lock();
> +	head = radix_tree_lookup(&root->root_node, idx);
> +	rcu_read_unlock();

This looks iffy, who protects head here?

> +	if (likely(head))
> +		return &head->pc[pfn - base_pfn];
> +	if (allocate == false)
> +		return NULL;
> +
> +	/* Very Slow Path. On demand allocation. */
> +	gfpmask = gfpmask & ~(__GFP_HIGHMEM | __GFP_MOVABLE);
> +
> +	head = alloc_init_page_cgroup(base_pfn, nid, gfpmask);
> +	if (!head)
> +		return ERR_PTR(-ENOMEM);
> +	pc = NULL;
> +	error = radix_tree_preload(gfpmask);
> +	if (error)
> +		goto out;
> +	spin_lock_irqsave(&root->tree_lock, flags);
> +	error = radix_tree_insert(&root->root_node, idx, head);
> +
> +	if (!error)
> +		pc = &head->pc[pfn - base_pfn];
> +	spin_unlock_irqrestore(&root->tree_lock, flags);
> +	radix_tree_preload_end();
> +out:
> +	if (!pc) {
> +		free_page_cgroup(head);
> +		if (error == -EEXIST)
> +			goto retry;
> +	}
> +	if (error)
> +		pc = ERR_PTR(error);
> +	return pc;
> +}
> +
> +__init int page_cgroup_init(void)
> +{
> +	int nid;
> +	struct page_cgroup_root *root;
> +
> +	page_cgroup_cachep = kmem_cache_create("page_cgroup",
> +				sizeof(struct page_cgroup_head), 0,
> +				SLAB_PANIC | SLAB_DESTROY_BY_RCU, NULL);
> +	if (!page_cgroup_cachep) {
> +		printk(KERN_ERR "page accouning setup failure\n");
> +		printk(KERN_ERR "can't initialize slab memory\n");
> +		/* FIX ME: should return some error code ? */
> +		return 0;
> +	}
> +	for_each_online_node(nid) {
> +		if (node_state(nid, N_NORMAL_MEMORY)
> +			root = kmalloc_node(sizeof(struct page_cgroup_root),
> +					GFP_KERNEL, nid);
> +		else
> +			root = kmalloc(sizeof(struct page_cgroup_root),
> +					GFP_KERNEL);

if (!node_state(nid, N_NORMAL_MEMORY))
  nid = -1;

allows us to use a single kmalloc_node() statement.

> +		INIT_RADIX_TREE(&root->root_node, GFP_ATOMIC);
> +		spin_lock_init(&root->tree_lock);
> +		smp_wmb();

unadorned barrier; we usually require a comment outlining the race, and
a reference to the matching barrier.

> +		root_dir[nid] = root;
> +	}
> +
> +	printk(KERN_INFO "Page Accouintg is activated\n");
> +	return 0;
> +}
> +late_initcall(page_cgroup_init);
> Index: mm-2.6.25-rc5-mm1/mm/Makefile
> ===================================================================
> --- mm-2.6.25-rc5-mm1.orig/mm/Makefile
> +++ mm-2.6.25-rc5-mm1/mm/Makefile
> @@ -32,5 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
>  obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>  
> Index: mm-2.6.25-rc5-mm1/init/Kconfig
> ===================================================================
> --- mm-2.6.25-rc5-mm1.orig/init/Kconfig
> +++ mm-2.6.25-rc5-mm1/init/Kconfig
> @@ -405,6 +405,20 @@ config SYSFS_DEPRECATED_V2
>  	  If you are using a distro with the most recent userspace
>  	  packages, it should be safe to say N here.
>  
> +config CGROUP_PAGE_CGROUP_ORDER
> +	int "Order of page accounting subsystem"
> +	range 0 10
> +	default 3 if HIGHMEM64G
> +	default 10 if 64BIT
> +	default 7
> +	depends on CGROUP_MEM_RES_CTLR
> +	help
> +	  By making this value to be small, wastes in memory usage of page
> +	  accounting can be small. But big number is good for perfomance.
> +	  Especially, HIGHMEM64G users should keep this to be small because
> +	  you tend to have small kernel memory.
> +	  If unsure, use default.
> +
>  config PROC_PID_CPUSET
>  	bool "Include legacy /proc/<pid>/cpuset file"
>  	depends on CPUSETS
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2008-03-19 21:11 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-03-14  9:59 [PATCH 0/7] memcg: radix-tree page_cgroup KAMEZAWA Hiroyuki
2008-03-14 10:03 ` [PATCH 1/7] re-define page_cgroup KAMEZAWA Hiroyuki
2008-03-16 14:15   ` Balbir Singh
2008-03-18  1:10     ` KAMEZAWA Hiroyuki
2008-03-17  0:21   ` Li Zefan
2008-03-18  1:12     ` KAMEZAWA Hiroyuki
2008-03-17  2:07   ` Li Zefan
2008-03-18  1:11     ` KAMEZAWA Hiroyuki
2008-03-14 10:06 ` [PATCH 2/7] charge/uncharge KAMEZAWA Hiroyuki
2008-03-17  1:46   ` Balbir Singh
2008-03-18  1:14     ` KAMEZAWA Hiroyuki
2008-03-17  2:26   ` Li Zefan
2008-03-18  1:15     ` KAMEZAWA Hiroyuki
2008-03-14 10:07 ` [PATCH 3/7] memcg: move_lists KAMEZAWA Hiroyuki
2008-03-18 16:44   ` Balbir Singh
2008-03-19  2:34     ` KAMEZAWA Hiroyuki
2008-03-14 10:15 ` [PATCH 4/7] memcg: page migration KAMEZAWA Hiroyuki
2008-03-17  2:36   ` Li Zefan
2008-03-18  1:17     ` KAMEZAWA Hiroyuki
2008-03-18 18:11   ` Balbir Singh
2008-03-19  2:44     ` KAMEZAWA Hiroyuki
2008-03-14 10:17 ` [PATCH 5/7] radix-tree page cgroup KAMEZAWA Hiroyuki
2008-03-17  2:56   ` Li Zefan
2008-03-17  3:26     ` Li Zefan
2008-03-18  1:18       ` KAMEZAWA Hiroyuki
2008-03-18  1:23     ` KAMEZAWA Hiroyuki
2008-03-19  2:05   ` Balbir Singh
2008-03-19  2:51     ` KAMEZAWA Hiroyuki
2008-03-19  3:14   ` Balbir Singh
2008-03-19  3:24     ` KAMEZAWA Hiroyuki
2008-03-19 21:11   ` Peter Zijlstra [this message]
2008-03-20  4:45     ` KAMEZAWA Hiroyuki
2008-03-20  5:09       ` KAMEZAWA Hiroyuki
2008-03-14 10:18 ` [PATCH 6/7] memcg: speed up by percpu KAMEZAWA Hiroyuki
2008-03-17  3:03   ` Li Zefan
2008-03-18  1:25     ` KAMEZAWA Hiroyuki
2008-03-18 23:55       ` Li Zefan
2008-03-19  2:51         ` KAMEZAWA Hiroyuki
2008-03-19 21:19   ` Peter Zijlstra
2008-03-19 21:41     ` Peter Zijlstra
2008-03-20  9:08       ` Andy Whitcroft
2008-03-20  4:46     ` KAMEZAWA Hiroyuki
2008-03-14 10:22 ` [PATCH 7/7] memcg: freeing page_cgroup at suitable chance KAMEZAWA Hiroyuki
2008-03-17  3:10   ` Li Zefan
2008-03-18  1:30     ` KAMEZAWA Hiroyuki
2008-03-19 21:33   ` Peter Zijlstra
2008-03-20  5:07     ` KAMEZAWA Hiroyuki
2008-03-20  7:55       ` Peter Zijlstra
2008-03-20 14:49         ` kamezawa.hiroyu
2008-03-20 16:04           ` kamezawa.hiroyu
2008-03-20 16:09             ` Peter Zijlstra
2008-03-20 16:15               ` kamezawa.hiroyu
2008-03-15  6:15 ` [PATCH 0/7] memcg: radix-tree page_cgroup Balbir Singh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1205961066.6437.10.camel@lappy \
    --to=a.p.zijlstra@chello.nl \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=hugh@veritas.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-mm@kvack.org \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.