From: bill4carson <bill4carson@gmail.com>
To: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: linux-mm@kvack.org, mgorman@suse.de,
kamezawa.hiroyu@jp.fujitsu.com, dhillf@gmail.com
Subject: Re: [RFC PATCH 5/6] hugetlbfs: Add controller support for private mapping
Date: Fri, 17 Feb 2012 13:22:44 +0800 [thread overview]
Message-ID: <4F3DE424.3010301@gmail.com> (raw)
In-Reply-To: <1328909806-15236-6-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
On 2012年02月11日 05:36, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V"<aneesh.kumar@linux.vnet.ibm.com>
>
> HugeTLB controller is different from a memory controller in that we charge
> controller during mmap() time and not fault time. This makes sure userspace
> can fall back to non-hugepage allocation when mmap fails due to controller
> limit.
>
> For private mapping we always charge/uncharge from the current task cgroup.
> Charging happens during mmap(2) and uncharge happens during the
> vm_operations->close when resv_map refcount reaches zero. The uncharge count
> is stored in struct resv_map. For child task after fork the charging happens
> during fault time in alloc_huge_page. We also need to make sure for private
> mapping each vma for hugeTLB mapping has a struct resv_map allocated so that we
> can store the uncharge count in resv_map.
>
> Signed-off-by: Aneesh Kumar K.V<aneesh.kumar@linux.vnet.ibm.com>
> ---
> fs/hugetlbfs/hugetlb_cgroup.c | 50 ++++++++++++++++++++++++++++++++
> include/linux/hugetlb.h | 7 ++++
> include/linux/hugetlb_cgroup.h | 16 ++++++++++
> mm/hugetlb.c | 62 ++++++++++++++++++++++++++++++++--------
> 4 files changed, 123 insertions(+), 12 deletions(-)
>
> diff --git a/fs/hugetlbfs/hugetlb_cgroup.c b/fs/hugetlbfs/hugetlb_cgroup.c
> index c478fb0..f828fb2 100644
> --- a/fs/hugetlbfs/hugetlb_cgroup.c
> +++ b/fs/hugetlbfs/hugetlb_cgroup.c
> @@ -458,3 +458,53 @@ long hugetlb_truncate_cgroup_charge(struct hstate *h,
> }
> return chg;
> }
> +
> +int hugetlb_priv_page_charge(struct resv_map *map, struct hstate *h, long chg)
> +{
> + long csize;
> + int idx, ret;
> + struct hugetlb_cgroup *h_cg;
> + struct res_counter *fail_res;
> +
> + /*
> + * Get the task cgroup within rcu_readlock and also
> + * get cgroup reference to make sure cgroup destroy won't
> + * race with page_charge. We don't allow a cgroup destroy
> + * when the cgroup have some charge against it
> + */
> + rcu_read_lock();
> + h_cg = task_hugetlbcgroup(current);
> + css_get(&h_cg->css);
> + rcu_read_unlock();
> +
> + if (hugetlb_cgroup_is_root(h_cg)) {
> + ret = chg;
> + goto err_out;
> + }
> +
> + csize = chg * huge_page_size(h);
> + idx = h - hstates;
> + ret = res_counter_charge(&h_cg->memhuge[idx], csize,&fail_res);
> + if (!ret) {
> + map->nr_pages[idx] += chg<< huge_page_order(h);
> + ret = chg;
> + }
> +err_out:
> + css_put(&h_cg->css);
> + return ret;
> +}
> +
> +void hugetlb_priv_page_uncharge(struct resv_map *map, int idx, int nr_pages)
> +{
> + struct hugetlb_cgroup *h_cg;
> + unsigned long csize = nr_pages * PAGE_SIZE;
> +
> + rcu_read_lock();
> + h_cg = task_hugetlbcgroup(current);
> + if (!hugetlb_cgroup_is_root(h_cg)) {
> + res_counter_uncharge(&h_cg->memhuge[idx], csize);
> + map->nr_pages[idx] -= nr_pages;
> + }
> + rcu_read_unlock();
> + return;
> +}
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 4392b6a..e2ba381 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -233,6 +233,12 @@ struct hstate {
> char name[HSTATE_NAME_LEN];
> };
>
> +struct resv_map {
> + struct kref refs;
> + int nr_pages[HUGE_MAX_HSTATE];
> + struct list_head regions;
> +};
> +
Please put resv_map after the HUGE_MAX_HSTATE definition;
otherwise it will break on non-x86 arches, which have no
HUGE_MAX_HSTATE definition.
#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif
+struct resv_map {
+ struct kref refs;
+ int nr_pages[HUGE_MAX_HSTATE];
+ struct list_head regions;
+};
> struct huge_bootmem_page {
> struct list_head list;
> struct hstate *hstate;
> @@ -323,6 +329,7 @@ static inline unsigned hstate_index_to_shift(unsigned index)
>
> #else
> struct hstate {};
> +struct resv_map {};
> #define alloc_huge_page_node(h, nid) NULL
> #define alloc_bootmem_huge_page(h) NULL
> #define hstate_file(f) NULL
> diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
> index 3131d62..c3738df 100644
> --- a/include/linux/hugetlb_cgroup.h
> +++ b/include/linux/hugetlb_cgroup.h
> @@ -32,6 +32,10 @@ extern void hugetlb_page_uncharge(struct list_head *head,
> extern void hugetlb_commit_page_charge(struct list_head *head, long f, long t);
> extern long hugetlb_truncate_cgroup_charge(struct hstate *h,
> struct list_head *head, long from);
> +extern int hugetlb_priv_page_charge(struct resv_map *map,
> + struct hstate *h, long chg);
> +extern void hugetlb_priv_page_uncharge(struct resv_map *map,
> + int idx, int nr_pages);
> #else
> static inline long hugetlb_page_charge(struct list_head *head,
> struct hstate *h, long f, long t)
> @@ -58,5 +62,17 @@ static inline long hugetlb_truncate_cgroup_charge(struct hstate *h,
> {
> return region_truncate(head, from);
> }
> +
> +static inline int hugetlb_priv_page_charge(struct resv_map *map,
> + struct hstate *h, long chg)
> +{
> + return chg;
> +}
> +
> +static inline void hugetlb_priv_page_uncharge(struct resv_map *map,
> + int idx, int nr_pages)
> +{
> + return;
> +}
> #endif /* CONFIG_CGROUP_HUGETLB_RES_CTLR */
> #endif
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 102410f..5a91838 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -303,14 +303,9 @@ static void set_vma_private_data(struct vm_area_struct *vma,
> vma->vm_private_data = (void *)value;
> }
>
> -struct resv_map {
> - struct kref refs;
> - struct list_head regions;
> -};
> -
> static struct resv_map *resv_map_alloc(void)
> {
> - struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
> + struct resv_map *resv_map = kzalloc(sizeof(*resv_map), GFP_KERNEL);
> if (!resv_map)
> return NULL;
>
> @@ -322,10 +317,16 @@ static struct resv_map *resv_map_alloc(void)
>
> static void resv_map_release(struct kref *ref)
> {
> + int idx;
> struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
>
> /* Clear out any active regions before we release the map. */
> region_truncate(&resv_map->regions, 0);
> + /* drop the hugetlb cgroup charge */
> + for (idx = 0; idx< HUGE_MAX_HSTATE; idx++) {
> + hugetlb_priv_page_uncharge(resv_map, idx,
> + resv_map->nr_pages[idx]);
> + }
> kfree(resv_map);
> }
>
> @@ -989,9 +990,20 @@ static long vma_needs_reservation(struct hstate *h,
> return hugetlb_page_charge(&inode->i_mapping->private_list,
> h, idx, idx + 1);
> } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
> - return 1;
> -
> + struct resv_map *resv_map = vma_resv_map(vma);
> + if (!resv_map) {
> + /*
> + * We didn't allocate resv_map for this vma.
> + * Allocate it here.
> + */
> + resv_map = resv_map_alloc();
> + if (!resv_map)
> + return -ENOMEM;
> + set_vma_resv_map(vma, resv_map);
> + }
> + return hugetlb_priv_page_charge(resv_map, h, 1);
> } else {
> + /* We did the priv page charging in mmap call */
> long err;
> pgoff_t idx = vma_hugecache_offset(h, vma, addr);
> struct resv_map *reservations = vma_resv_map(vma);
> @@ -1007,14 +1019,20 @@ static void vma_uncharge_reservation(struct hstate *h,
> struct vm_area_struct *vma,
> unsigned long chg)
> {
> + int idx = h - hstates;
> struct address_space *mapping = vma->vm_file->f_mapping;
> struct inode *inode = mapping->host;
>
>
> if (vma->vm_flags& VM_MAYSHARE) {
> return hugetlb_page_uncharge(&inode->i_mapping->private_list,
> - h - hstates,
> - chg<< huge_page_order(h));
> + idx, chg<< huge_page_order(h));
> + } else {
> + struct resv_map *resv_map = vma_resv_map(vma);
> +
> + return hugetlb_priv_page_uncharge(resv_map,
> + idx,
> + chg<< huge_page_order(h));
> }
> }
>
> @@ -2165,6 +2183,22 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
> */
> if (reservations)
> kref_get(&reservations->refs);
> + else if (!(vma->vm_flags& VM_MAYSHARE)) {
> + /*
> + * for non shared vma we need resv map to track
> + * hugetlb cgroup usage. Allocate it here. Charging
> + * the cgroup will take place in fault path.
> + */
> + struct resv_map *resv_map = resv_map_alloc();
> + /*
> + * If we fail to allocate resv_map here. We will allocate
> + * one when we do alloc_huge_page. So we don't handle
> + * ENOMEM here. The function also return void. So there is
> + * nothing much we can do.
> + */
> + if (resv_map)
> + set_vma_resv_map(vma, resv_map);
> + }
> }
>
> static void hugetlb_vm_op_close(struct vm_area_struct *vma)
> @@ -2968,7 +3002,7 @@ int hugetlb_reserve_pages(struct inode *inode,
> {
> long ret, chg;
> struct hstate *h = hstate_inode(inode);
> -
> + struct resv_map *resv_map = NULL;
> /*
> * Only apply hugepage reservation if asked. At fault time, an
> * attempt will be made for VM_NORESERVE to allocate a page
> @@ -2987,7 +3021,7 @@ int hugetlb_reserve_pages(struct inode *inode,
> chg = hugetlb_page_charge(&inode->i_mapping->private_list,
> h, from, to);
> } else {
> - struct resv_map *resv_map = resv_map_alloc();
> + resv_map = resv_map_alloc();
> if (!resv_map)
> return -ENOMEM;
>
> @@ -2995,6 +3029,7 @@ int hugetlb_reserve_pages(struct inode *inode,
>
> set_vma_resv_map(vma, resv_map);
> set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
> + chg = hugetlb_priv_page_charge(resv_map, h, chg);
> }
>
> if (chg< 0)
> @@ -3033,6 +3068,9 @@ err_quota:
> if (!vma || vma->vm_flags& VM_MAYSHARE)
> hugetlb_page_uncharge(&inode->i_mapping->private_list,
> h - hstates, chg<< huge_page_order(h));
> + else
> + hugetlb_priv_page_uncharge(resv_map, h - hstates,
> + chg<< huge_page_order(h));
> return ret;
>
> }
--
I am a slow learner
but I will keep trying to fight for my dreams!
--bill
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2012-02-17 5:20 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-02-10 21:36 [RFC PATCH 0/6] hugetlbfs: Add cgroup resource controller for hugetlbfs Aneesh Kumar K.V
2012-02-10 21:36 ` [RFC PATCH 1/6] hugetlb: Add a new hugetlb cgroup Aneesh Kumar K.V
2012-02-10 21:36 ` [RFC PATCH 2/6] hugetlbfs: Add usage and max usage files to " Aneesh Kumar K.V
2012-02-10 21:36 ` [RFC PATCH 3/6] hugetlbfs: Add new region handling functions Aneesh Kumar K.V
2012-02-10 21:36 ` [RFC PATCH 4/6] hugetlbfs: Add controller support for shared mapping Aneesh Kumar K.V
2012-02-10 21:36 ` [RFC PATCH 5/6] hugetlbfs: Add controller support for private mapping Aneesh Kumar K.V
2012-02-17 5:22 ` bill4carson [this message]
2012-02-17 8:05 ` Aneesh Kumar K.V
2012-02-10 21:36 ` [RFC PATCH 6/6] hugetlbfs: Switch to new region APIs Aneesh Kumar K.V
2012-02-11 12:37 ` [RFC PATCH 0/6] hugetlbfs: Add cgroup resource controller for hugetlbfs Hillf Danton
2012-02-12 17:44 ` Aneesh Kumar K.V
2012-02-14 6:58 ` KAMEZAWA Hiroyuki
2012-02-17 8:00 ` Aneesh Kumar K.V
2012-02-17 8:03 ` Aneesh Kumar K.V
2012-02-17 8:03 ` Aneesh Kumar K.V
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4F3DE424.3010301@gmail.com \
--to=bill4carson@gmail.com \
--cc=aneesh.kumar@linux.vnet.ibm.com \
--cc=dhillf@gmail.com \
--cc=kamezawa.hiroyu@jp.fujitsu.com \
--cc=linux-mm@kvack.org \
--cc=mgorman@suse.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.