From: bill4carson <bill4carson@gmail.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: linux-mm@kvack.org, mgorman@suse.de,
kamezawa.hiroyu@jp.fujitsu.com, dhillf@gmail.com
Subject: Re: [RFC PATCH 5/6] hugetlbfs: Add controller support for private mapping
Date: Fri, 17 Feb 2012 13:22:44 +0800
Message-ID: <4F3DE424.3010301@gmail.com>
In-Reply-To: <1328909806-15236-6-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
On 2012/02/11 05:36, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>
> The HugeTLB controller differs from the memory controller in that we charge
> the controller at mmap() time rather than at fault time. This makes sure
> userspace can fall back to non-hugepage allocation when mmap() fails due to
> the controller limit.
>
> For private mappings we always charge/uncharge the current task's cgroup.
> Charging happens during mmap(2) and uncharging happens in
> vm_operations->close when the resv_map refcount reaches zero. The uncharge
> count is stored in struct resv_map. For a child task after fork, charging
> happens at fault time in alloc_huge_page. We also need to make sure that
> for private mappings each hugeTLB vma has a struct resv_map allocated, so
> that we can store the uncharge count there.
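
Just to check that I read the life cycle right (all function names below
are taken from this patch, so please correct me if the flow is wrong):

  mmap()                      -> hugetlb_reserve_pages()
                                   -> hugetlb_priv_page_charge()    /* owner vma, charged up front */
  fault in child (after fork) -> alloc_huge_page()
                                   -> vma_needs_reservation()
                                     -> hugetlb_priv_page_charge()  /* non-owner vma, charged at fault */
  vm_operations->close        -> resv_map_release()                 /* resv_map refcount hits zero */
                                   -> hugetlb_priv_page_uncharge()  /* count from resv_map->nr_pages[] */
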
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> fs/hugetlbfs/hugetlb_cgroup.c | 50 ++++++++++++++++++++++++++++++++
> include/linux/hugetlb.h | 7 ++++
> include/linux/hugetlb_cgroup.h | 16 ++++++++++
> mm/hugetlb.c | 62 ++++++++++++++++++++++++++++++++--------
> 4 files changed, 123 insertions(+), 12 deletions(-)
>
> diff --git a/fs/hugetlbfs/hugetlb_cgroup.c b/fs/hugetlbfs/hugetlb_cgroup.c
> index c478fb0..f828fb2 100644
> --- a/fs/hugetlbfs/hugetlb_cgroup.c
> +++ b/fs/hugetlbfs/hugetlb_cgroup.c
> @@ -458,3 +458,53 @@ long hugetlb_truncate_cgroup_charge(struct hstate *h,
> }
> return chg;
> }
> +
> +int hugetlb_priv_page_charge(struct resv_map *map, struct hstate *h, long chg)
> +{
> + long csize;
> + int idx, ret;
> + struct hugetlb_cgroup *h_cg;
> + struct res_counter *fail_res;
> +
> + /*
> + * Get the task cgroup within rcu_readlock and also
> + * get cgroup reference to make sure cgroup destroy won't
> + * race with page_charge. We don't allow a cgroup destroy
> + * when the cgroup have some charge against it
> + */
> + rcu_read_lock();
> + h_cg = task_hugetlbcgroup(current);
> + css_get(&h_cg->css);
> + rcu_read_unlock();
> +
> + if (hugetlb_cgroup_is_root(h_cg)) {
> + ret = chg;
> + goto err_out;
> + }
> +
> + csize = chg * huge_page_size(h);
> + idx = h - hstates;
> + ret = res_counter_charge(&h_cg->memhuge[idx], csize, &fail_res);
> + if (!ret) {
> + map->nr_pages[idx] += chg << huge_page_order(h);
> + ret = chg;
> + }
> +err_out:
> + css_put(&h_cg->css);
> + return ret;
> +}
> +
> +void hugetlb_priv_page_uncharge(struct resv_map *map, int idx, int nr_pages)
> +{
> + struct hugetlb_cgroup *h_cg;
> + unsigned long csize = nr_pages * PAGE_SIZE;
> +
> + rcu_read_lock();
> + h_cg = task_hugetlbcgroup(current);
> + if (!hugetlb_cgroup_is_root(h_cg)) {
> + res_counter_uncharge(&h_cg->memhuge[idx], csize);
> + map->nr_pages[idx] -= nr_pages;
> + }
> + rcu_read_unlock();
> + return;
> +}
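
If I follow the size accounting above correctly, the charge and uncharge
paths use different units but end up at the same byte count against the
res_counter. A quick worked example (just my reading of the code, taking
a 2MB huge page with 4KB PAGE_SIZE):

	/* charge side */
	csize = chg * huge_page_size(h);                  /* 1 * 2MB  = 2MB */
	map->nr_pages[idx] += chg << huge_page_order(h);  /* 1 << 9   = 512 */

	/* uncharge side */
	csize = nr_pages * PAGE_SIZE;                     /* 512 * 4KB = 2MB */

so whatever is charged at mmap() time is dropped again in full when
resv_map_release() runs.
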
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 4392b6a..e2ba381 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -233,6 +233,12 @@ struct hstate {
> char name[HSTATE_NAME_LEN];
> };
>
> +struct resv_map {
> + struct kref refs;
> + int nr_pages[HUGE_MAX_HSTATE];
> + struct list_head regions;
> +};
> +
Please put struct resv_map after the HUGE_MAX_HSTATE fallback definition;
otherwise this breaks the build on non-x86 arches, which do not define
HUGE_MAX_HSTATE themselves, i.e.:
#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif
+struct resv_map {
+ struct kref refs;
+ int nr_pages[HUGE_MAX_HSTATE];
+ struct list_head regions;
+};
> struct huge_bootmem_page {
> struct list_head list;
> struct hstate *hstate;
> @@ -323,6 +329,7 @@ static inline unsigned hstate_index_to_shift(unsigned index)
>
> #else
> struct hstate {};
> +struct resv_map {};
> #define alloc_huge_page_node(h, nid) NULL
> #define alloc_bootmem_huge_page(h) NULL
> #define hstate_file(f) NULL
> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> index 3131d62..c3738df 100644
> --- a/include/linux/hugetlb_cgroup.h
> +++ b/include/linux/hugetlb_cgroup.h
> @@ -32,6 +32,10 @@ extern void hugetlb_page_uncharge(struct list_head *head,
> extern void hugetlb_commit_page_charge(struct list_head *head, long f, long t);
> extern long hugetlb_truncate_cgroup_charge(struct hstate *h,
> struct list_head *head, long from);
> +extern int hugetlb_priv_page_charge(struct resv_map *map,
> + struct hstate *h, long chg);
> +extern void hugetlb_priv_page_uncharge(struct resv_map *map,
> + int idx, int nr_pages);
> #else
> static inline long hugetlb_page_charge(struct list_head *head,
> struct hstate *h, long f, long t)
> @@ -58,5 +62,17 @@ static inline long hugetlb_truncate_cgroup_charge(struct hstate *h,
> {
> return region_truncate(head, from);
> }
> +
> +static inline int hugetlb_priv_page_charge(struct resv_map *map,
> + struct hstate *h, long chg)
> +{
> + return chg;
> +}
> +
> +static inline void hugetlb_priv_page_uncharge(struct resv_map *map,
> + int idx, int nr_pages)
> +{
> + return;
> +}
> #endif /* CONFIG_CGROUP_HUGETLB_RES_CTLR */
> #endif
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 102410f..5a91838 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -303,14 +303,9 @@ static void set_vma_private_data(struct vm_area_struct *vma,
> vma->vm_private_data = (void *)value;
> }
>
> -struct resv_map {
> - struct kref refs;
> - struct list_head regions;
> -};
> -
> static struct resv_map *resv_map_alloc(void)
> {
> - struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
> + struct resv_map *resv_map = kzalloc(sizeof(*resv_map), GFP_KERNEL);
> if (!resv_map)
> return NULL;
>
> @@ -322,10 +317,16 @@ static struct resv_map *resv_map_alloc(void)
>
> static void resv_map_release(struct kref *ref)
> {
> + int idx;
> struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
>
> /* Clear out any active regions before we release the map. */
> region_truncate(&resv_map->regions, 0);
> + /* drop the hugetlb cgroup charge */
> + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
> + hugetlb_priv_page_uncharge(resv_map, idx,
> + resv_map->nr_pages[idx]);
> + }
> kfree(resv_map);
> }
>
> @@ -989,9 +990,20 @@ static long vma_needs_reservation(struct hstate *h,
> return hugetlb_page_charge(&inode->i_mapping->private_list,
> h, idx, idx + 1);
> } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
> - return 1;
> -
> + struct resv_map *resv_map = vma_resv_map(vma);
> + if (!resv_map) {
> + /*
> + * We didn't allocate resv_map for this vma.
> + * Allocate it here.
> + */
> + resv_map = resv_map_alloc();
> + if (!resv_map)
> + return -ENOMEM;
> + set_vma_resv_map(vma, resv_map);
> + }
> + return hugetlb_priv_page_charge(resv_map, h, 1);
> } else {
> + /* We did the priv page charging in mmap call */
> long err;
> pgoff_t idx = vma_hugecache_offset(h, vma, addr);
> struct resv_map *reservations = vma_resv_map(vma);
> @@ -1007,14 +1019,20 @@ static void vma_uncharge_reservation(struct hstate *h,
> struct vm_area_struct *vma,
> unsigned long chg)
> {
> + int idx = h - hstates;
> struct address_space *mapping = vma->vm_file->f_mapping;
> struct inode *inode = mapping->host;
>
>
> if (vma->vm_flags & VM_MAYSHARE) {
> return hugetlb_page_uncharge(&inode->i_mapping->private_list,
> - h - hstates,
> - chg << huge_page_order(h));
> + idx, chg << huge_page_order(h));
> + } else {
> + struct resv_map *resv_map = vma_resv_map(vma);
> +
> + return hugetlb_priv_page_uncharge(resv_map,
> + idx,
> + chg << huge_page_order(h));
> }
> }
>
> @@ -2165,6 +2183,22 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
> */
> if (reservations)
> kref_get(&reservations->refs);
> + else if (!(vma->vm_flags & VM_MAYSHARE)) {
> + /*
> + * for non shared vma we need resv map to track
> + * hugetlb cgroup usage. Allocate it here. Charging
> + * the cgroup will take place in fault path.
> + */
> + struct resv_map *resv_map = resv_map_alloc();
> + /*
> + * If we fail to allocate resv_map here. We will allocate
> + * one when we do alloc_huge_page. So we don't handle
> + * ENOMEM here. The function also return void. So there is
> + * nothing much we can do.
> + */
> + if (resv_map)
> + set_vma_resv_map(vma, resv_map);
> + }
> }
>
> static void hugetlb_vm_op_close(struct vm_area_struct *vma)
> @@ -2968,7 +3002,7 @@ int hugetlb_reserve_pages(struct inode *inode,
> {
> long ret, chg;
> struct hstate *h = hstate_inode(inode);
> -
> + struct resv_map *resv_map = NULL;
> /*
> * Only apply hugepage reservation if asked. At fault time, an
> * attempt will be made for VM_NORESERVE to allocate a page
> @@ -2987,7 +3021,7 @@ int hugetlb_reserve_pages(struct inode *inode,
> chg = hugetlb_page_charge(&inode->i_mapping->private_list,
> h, from, to);
> } else {
> - struct resv_map *resv_map = resv_map_alloc();
> + resv_map = resv_map_alloc();
> if (!resv_map)
> return -ENOMEM;
>
> @@ -2995,6 +3029,7 @@ int hugetlb_reserve_pages(struct inode *inode,
>
> set_vma_resv_map(vma, resv_map);
> set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
> + chg = hugetlb_priv_page_charge(resv_map, h, chg);
> }
>
> if (chg < 0)
> @@ -3033,6 +3068,9 @@ err_quota:
> if (!vma || vma->vm_flags & VM_MAYSHARE)
> hugetlb_page_uncharge(&inode->i_mapping->private_list,
> h - hstates, chg << huge_page_order(h));
> + else
> + hugetlb_priv_page_uncharge(resv_map, h - hstates,
> chg << huge_page_order(h));
> return ret;
>
> }
--
I am a slow learner
but I will keep trying to fight for my dreams!
--bill