All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
To: Hillf Danton <dhillf@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, mgorman@suse.de,
	kamezawa.hiroyu@jp.fujitsu.com, aarcange@redhat.com,
	mhocko@suse.cz, hannes@cmpxchg.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org,
	David Gibson <david@gibson.dropbear.id.au>
Subject: Re: [PATCH -V2 0/9] memcg: add HugeTLB resource tracking
Date: Tue, 06 Mar 2012 19:35:56 +0530	[thread overview]
Message-ID: <87hay1akor.fsf@linux.vnet.ibm.com> (raw)
In-Reply-To: <CAJd=RBAJxVs0Jz+=PNO222oDvF0n6+hh7FNuFpSYTS3EJL8fpw@mail.gmail.com>

On Mon, 5 Mar 2012 21:56:50 +0800, Hillf Danton <dhillf@gmail.com> wrote:
> On Mon, Mar 5, 2012 at 3:15 AM, Aneesh Kumar K.V
> <aneesh.kumar@linux.vnet.ibm.com> wrote:
> > On Thu, 1 Mar 2012 14:40:29 -0800, Andrew Morton <akpm@linux-foundation.org> wrote:
> >> I haven't begun to get my head around this yet, but I'd like to draw
> >> your attention to https://lkml.org/lkml/2012/2/15/548.
> >
> > Hmm, that's a really serious bug.
> >
> >>  That fix has
> >> been hanging around for a while, but I haven't done anything with it
> >> yet because I don't like its additional blurring of the separation
> >> between hugetlb core code and hugetlbfs.  I want to find time to sit
> >> down and see if the fix can be better architected but haven't got
> >> around to that yet.
> >>
> >> I expect that your patches will conflict at least mechanically with
> >> David's, which is not a big issue.  But I wonder whether your patches
> >> will copy the same bug into other places, and whether you can think of
> >> a tidier way of addressing the bug which David is seeing?
> >>
> >
> > I will go through the implementation again and make sure the problem
> > explained by David doesn't happen in the new code path added by the
> > patch series.
> >
> Hi Aneesh
> 
> When you tackle that problem, please take the following approach also
> into account, though it is a draft, in which quota handback is simply
> eliminated when huge page is freed, if that problem is caused by extra
> reference count.
> And get_quota is carefully paired with put_quota for newly allocated
> page. That is all, and feel free to correct me.

But we should not do put quota until the page is added back to the free pool,
right? Otherwise the quota subsystem (the actual hugetlb pool) will
indicate availability, whereas the file system won't have any free
pages.

-aneesh

> 
> Best Regards
> -hd
> 
> --- a/mm/hugetlb.c	Mon Mar  5 20:20:34 2012
> +++ b/mm/hugetlb.c	Mon Mar  5 21:20:14 2012
> @@ -533,9 +533,7 @@ static void free_huge_page(struct page *
>  	 */
>  	struct hstate *h = page_hstate(page);
>  	int nid = page_to_nid(page);
> -	struct address_space *mapping;
> 
> -	mapping = (struct address_space *) page_private(page);
>  	set_page_private(page, 0);
>  	page->mapping = NULL;
>  	BUG_ON(page_count(page));
> @@ -551,8 +549,6 @@ static void free_huge_page(struct page *
>  		enqueue_huge_page(h, page);
>  	}
>  	spin_unlock(&hugetlb_lock);
> -	if (mapping)
> -		hugetlb_put_quota(mapping, 1);
>  }
> 
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> @@ -1021,7 +1017,8 @@ static void vma_commit_reservation(struc
>  }
> 
>  static struct page *alloc_huge_page(struct vm_area_struct *vma,
> -				    unsigned long addr, int avoid_reserve)
> +				    unsigned long addr, int avoid_reserve,
> +				    long *quota)
>  {
>  	struct hstate *h = hstate_vma(vma);
>  	struct page *page;
> @@ -1050,7 +1047,8 @@ static struct page *alloc_huge_page(stru
>  	if (!page) {
>  		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
>  		if (!page) {
> -			hugetlb_put_quota(inode->i_mapping, chg);
> +			if (chg)
> +				hugetlb_put_quota(inode->i_mapping, chg);
>  			return ERR_PTR(-VM_FAULT_SIGBUS);
>  		}
>  	}
> @@ -1058,6 +1056,8 @@ static struct page *alloc_huge_page(stru
>  	set_page_private(page, (unsigned long) mapping);
> 
>  	vma_commit_reservation(h, vma, addr);
> +	if (quota)
> +		*quota = chg;
> 
>  	return page;
>  }
> @@ -2365,6 +2365,7 @@ static int hugetlb_cow(struct mm_struct
>  	struct page *old_page, *new_page;
>  	int avoidcopy;
>  	int outside_reserve = 0;
> +	long quota = 0;
> 
>  	old_page = pte_page(pte);
> 
> @@ -2397,7 +2398,8 @@ retry_avoidcopy:
> 
>  	/* Drop page_table_lock as buddy allocator may be called */
>  	spin_unlock(&mm->page_table_lock);
> -	new_page = alloc_huge_page(vma, address, outside_reserve);
> +	quota = 0;
> +	new_page = alloc_huge_page(vma, address, outside_reserve, &quota);
> 
>  	if (IS_ERR(new_page)) {
>  		page_cache_release(old_page);
> @@ -2439,6 +2441,8 @@ retry_avoidcopy:
>  	if (unlikely(anon_vma_prepare(vma))) {
>  		page_cache_release(new_page);
>  		page_cache_release(old_page);
> +		if (quota)
> +			hugetlb_put_quota(vma->vm_file->f_mapping, quota);
>  		/* Caller expects lock to be held */
>  		spin_lock(&mm->page_table_lock);
>  		return VM_FAULT_OOM;
> @@ -2470,6 +2474,8 @@ retry_avoidcopy:
>  			address & huge_page_mask(h),
>  			(address & huge_page_mask(h)) + huge_page_size(h));
>  	}
> +	else if (quota)
> +		hugetlb_put_quota(vma->vm_file->f_mapping, quota);
>  	page_cache_release(new_page);
>  	page_cache_release(old_page);
>  	return 0;
> @@ -2519,6 +2525,7 @@ static int hugetlb_no_page(struct mm_str
>  	struct page *page;
>  	struct address_space *mapping;
>  	pte_t new_pte;
> +	long quota = 0;
> 
>  	/*
>  	 * Currently, we are forced to kill the process in the event the
> @@ -2540,12 +2547,13 @@ static int hugetlb_no_page(struct mm_str
>  	 * before we get page_table_lock.
>  	 */
>  retry:
> +	quota = 0;
>  	page = find_lock_page(mapping, idx);
>  	if (!page) {
>  		size = i_size_read(mapping->host) >> huge_page_shift(h);
>  		if (idx >= size)
>  			goto out;
> -		page = alloc_huge_page(vma, address, 0);
> +		page = alloc_huge_page(vma, address, 0, &quota);
>  		if (IS_ERR(page)) {
>  			ret = -PTR_ERR(page);
>  			goto out;
> @@ -2560,6 +2568,8 @@ retry:
>  			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
>  			if (err) {
>  				put_page(page);
> +				if (quota)
> +					hugetlb_put_quota(mapping, quota);
>  				if (err == -EEXIST)
>  					goto retry;
>  				goto out;
> @@ -2633,6 +2643,8 @@ backout:
>  backout_unlocked:
>  	unlock_page(page);
>  	put_page(page);
> +	if (quota)
> +		hugetlb_put_quota(mapping, quota);
>  	goto out;
>  }
> 
> --
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
To: Hillf Danton <dhillf@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, mgorman@suse.de,
	kamezawa.hiroyu@jp.fujitsu.com, aarcange@redhat.com,
	mhocko@suse.cz, hannes@cmpxchg.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org,
	David Gibson <david@gibson.dropbear.id.au>
Subject: Re: [PATCH -V2 0/9] memcg: add HugeTLB resource tracking
Date: Tue, 06 Mar 2012 19:35:56 +0530	[thread overview]
Message-ID: <87hay1akor.fsf@linux.vnet.ibm.com> (raw)
In-Reply-To: <CAJd=RBAJxVs0Jz+=PNO222oDvF0n6+hh7FNuFpSYTS3EJL8fpw@mail.gmail.com>

On Mon, 5 Mar 2012 21:56:50 +0800, Hillf Danton <dhillf@gmail.com> wrote:
> On Mon, Mar 5, 2012 at 3:15 AM, Aneesh Kumar K.V
> <aneesh.kumar@linux.vnet.ibm.com> wrote:
> > On Thu, 1 Mar 2012 14:40:29 -0800, Andrew Morton <akpm@linux-foundation.org> wrote:
> >> I haven't begun to get my head around this yet, but I'd like to draw
> >> your attention to https://lkml.org/lkml/2012/2/15/548.
> >
> > Hmm, that's a really serious bug.
> >
> >>  That fix has
> >> been hanging around for a while, but I haven't done anything with it
> >> yet because I don't like its additional blurring of the separation
> >> between hugetlb core code and hugetlbfs.  I want to find time to sit
> >> down and see if the fix can be better architected but haven't got
> >> around to that yet.
> >>
> >> I expect that your patches will conflict at least mechanically with
> >> David's, which is not a big issue.  But I wonder whether your patches
> >> will copy the same bug into other places, and whether you can think of
> >> a tidier way of addressing the bug which David is seeing?
> >>
> >
> > I will go through the implementation again and make sure the problem
> > explained by David doesn't happen in the new code path added by the
> > patch series.
> >
> Hi Aneesh
> 
> When you tackle that problem, please take the following approach also
> into account, though it is a draft, in which quota handback is simply
> eliminated when huge page is freed, if that problem is caused by extra
> reference count.
> And get_quota is carefully paired with put_quota for newly allocated
> page. That is all, and feel free to correct me.

But we should not do put quota until the page is added back to the free pool,
right? Otherwise the quota subsystem (the actual hugetlb pool) will
indicate availability, whereas the file system won't have any free
pages.

-aneesh

> 
> Best Regards
> -hd
> 
> --- a/mm/hugetlb.c	Mon Mar  5 20:20:34 2012
> +++ b/mm/hugetlb.c	Mon Mar  5 21:20:14 2012
> @@ -533,9 +533,7 @@ static void free_huge_page(struct page *
>  	 */
>  	struct hstate *h = page_hstate(page);
>  	int nid = page_to_nid(page);
> -	struct address_space *mapping;
> 
> -	mapping = (struct address_space *) page_private(page);
>  	set_page_private(page, 0);
>  	page->mapping = NULL;
>  	BUG_ON(page_count(page));
> @@ -551,8 +549,6 @@ static void free_huge_page(struct page *
>  		enqueue_huge_page(h, page);
>  	}
>  	spin_unlock(&hugetlb_lock);
> -	if (mapping)
> -		hugetlb_put_quota(mapping, 1);
>  }
> 
>  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> @@ -1021,7 +1017,8 @@ static void vma_commit_reservation(struc
>  }
> 
>  static struct page *alloc_huge_page(struct vm_area_struct *vma,
> -				    unsigned long addr, int avoid_reserve)
> +				    unsigned long addr, int avoid_reserve,
> +				    long *quota)
>  {
>  	struct hstate *h = hstate_vma(vma);
>  	struct page *page;
> @@ -1050,7 +1047,8 @@ static struct page *alloc_huge_page(stru
>  	if (!page) {
>  		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
>  		if (!page) {
> -			hugetlb_put_quota(inode->i_mapping, chg);
> +			if (chg)
> +				hugetlb_put_quota(inode->i_mapping, chg);
>  			return ERR_PTR(-VM_FAULT_SIGBUS);
>  		}
>  	}
> @@ -1058,6 +1056,8 @@ static struct page *alloc_huge_page(stru
>  	set_page_private(page, (unsigned long) mapping);
> 
>  	vma_commit_reservation(h, vma, addr);
> +	if (quota)
> +		*quota = chg;
> 
>  	return page;
>  }
> @@ -2365,6 +2365,7 @@ static int hugetlb_cow(struct mm_struct
>  	struct page *old_page, *new_page;
>  	int avoidcopy;
>  	int outside_reserve = 0;
> +	long quota = 0;
> 
>  	old_page = pte_page(pte);
> 
> @@ -2397,7 +2398,8 @@ retry_avoidcopy:
> 
>  	/* Drop page_table_lock as buddy allocator may be called */
>  	spin_unlock(&mm->page_table_lock);
> -	new_page = alloc_huge_page(vma, address, outside_reserve);
> +	quota = 0;
> +	new_page = alloc_huge_page(vma, address, outside_reserve, &quota);
> 
>  	if (IS_ERR(new_page)) {
>  		page_cache_release(old_page);
> @@ -2439,6 +2441,8 @@ retry_avoidcopy:
>  	if (unlikely(anon_vma_prepare(vma))) {
>  		page_cache_release(new_page);
>  		page_cache_release(old_page);
> +		if (quota)
> +			hugetlb_put_quota(vma->vm_file->f_mapping, quota);
>  		/* Caller expects lock to be held */
>  		spin_lock(&mm->page_table_lock);
>  		return VM_FAULT_OOM;
> @@ -2470,6 +2474,8 @@ retry_avoidcopy:
>  			address & huge_page_mask(h),
>  			(address & huge_page_mask(h)) + huge_page_size(h));
>  	}
> +	else if (quota)
> +		hugetlb_put_quota(vma->vm_file->f_mapping, quota);
>  	page_cache_release(new_page);
>  	page_cache_release(old_page);
>  	return 0;
> @@ -2519,6 +2525,7 @@ static int hugetlb_no_page(struct mm_str
>  	struct page *page;
>  	struct address_space *mapping;
>  	pte_t new_pte;
> +	long quota = 0;
> 
>  	/*
>  	 * Currently, we are forced to kill the process in the event the
> @@ -2540,12 +2547,13 @@ static int hugetlb_no_page(struct mm_str
>  	 * before we get page_table_lock.
>  	 */
>  retry:
> +	quota = 0;
>  	page = find_lock_page(mapping, idx);
>  	if (!page) {
>  		size = i_size_read(mapping->host) >> huge_page_shift(h);
>  		if (idx >= size)
>  			goto out;
> -		page = alloc_huge_page(vma, address, 0);
> +		page = alloc_huge_page(vma, address, 0, &quota);
>  		if (IS_ERR(page)) {
>  			ret = -PTR_ERR(page);
>  			goto out;
> @@ -2560,6 +2568,8 @@ retry:
>  			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
>  			if (err) {
>  				put_page(page);
> +				if (quota)
> +					hugetlb_put_quota(mapping, quota);
>  				if (err == -EEXIST)
>  					goto retry;
>  				goto out;
> @@ -2633,6 +2643,8 @@ backout:
>  backout_unlocked:
>  	unlock_page(page);
>  	put_page(page);
> +	if (quota)
> +		hugetlb_put_quota(mapping, quota);
>  	goto out;
>  }
> 
> --
> 


  reply	other threads:[~2012-03-06 14:05 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-03-01  9:16 [PATCH -V2 0/9] memcg: add HugeTLB resource tracking Aneesh Kumar K.V
2012-03-01  9:16 ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 1/9] mm: move hugetlbfs region tracking function to common code Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
     [not found]   ` <1330593380-1361-2-git-send-email-aneesh.kumar-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2012-03-01 22:33     ` Andrew Morton
2012-03-01 22:33       ` Andrew Morton
2012-03-01 22:33       ` Andrew Morton
     [not found]       ` <20120301143345.7e928efe.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2012-03-04 17:37         ` Aneesh Kumar K.V
2012-03-04 17:37           ` Aneesh Kumar K.V
2012-03-04 17:37           ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 2/9] mm: Update region function to take new data arg Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 3/9] hugetlbfs: Use the generic region API and drop local one Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 4/9] memcg: Add non reclaim resource tracking to memcg Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-02  8:38   ` KAMEZAWA Hiroyuki
2012-03-02  8:38     ` KAMEZAWA Hiroyuki
     [not found]     ` <20120302173816.9796f243.kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
2012-03-04 18:07       ` Aneesh Kumar K.V
2012-03-04 18:07         ` Aneesh Kumar K.V
2012-03-04 18:07         ` Aneesh Kumar K.V
     [not found]         ` <87ipikdyud.fsf-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2012-03-08  5:56           ` KAMEZAWA Hiroyuki
2012-03-08  5:56             ` KAMEZAWA Hiroyuki
2012-03-08  5:56             ` KAMEZAWA Hiroyuki
2012-03-08 11:48             ` Aneesh Kumar K.V
2012-03-08 11:48               ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 5/9] hugetlbfs: Add memory controller support for shared mapping Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 6/9] hugetlbfs: Add memory controller support for private mapping Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
     [not found]   ` <1330593380-1361-7-git-send-email-aneesh.kumar-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2012-05-17 23:16     ` Darrick J. Wong
2012-05-17 23:16       ` Darrick J. Wong
2012-05-17 23:16       ` Darrick J. Wong
2012-03-01  9:16 ` [PATCH -V2 7/9] memcg: track resource index in cftype private Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 8/9] hugetlbfs: Add memcg control files for hugetlbfs Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-01  9:16 ` [PATCH -V2 9/9] memcg: Add memory controller documentation for hugetlb management Aneesh Kumar K.V
2012-03-01  9:16   ` Aneesh Kumar K.V
2012-03-01 22:40 ` [PATCH -V2 0/9] memcg: add HugeTLB resource tracking Andrew Morton
2012-03-01 22:40   ` Andrew Morton
     [not found]   ` <20120301144029.545a5589.akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2012-03-02  3:28     ` David Gibson
2012-03-02  3:28       ` David Gibson
2012-03-02  3:28       ` David Gibson
2012-03-04 18:09       ` Aneesh Kumar K.V
2012-03-04 18:09         ` Aneesh Kumar K.V
     [not found]         ` <87fwdodyr0.fsf-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2012-03-06  2:38           ` David Gibson
2012-03-06  2:38             ` David Gibson
2012-03-06  2:38             ` David Gibson
2012-03-04 19:15   ` Aneesh Kumar K.V
2012-03-04 19:15     ` Aneesh Kumar K.V
2012-03-05 13:56     ` Hillf Danton
2012-03-05 13:56       ` Hillf Danton
2012-03-06 14:05       ` Aneesh Kumar K.V [this message]
2012-03-06 14:05         ` Aneesh Kumar K.V
     [not found] ` <1330593380-1361-1-git-send-email-aneesh.kumar-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
2012-03-02  5:48   ` KAMEZAWA Hiroyuki
2012-03-02  5:48     ` KAMEZAWA Hiroyuki
2012-03-02  5:48     ` KAMEZAWA Hiroyuki
2012-03-04 18:14     ` Aneesh Kumar K.V
2012-03-04 18:14       ` Aneesh Kumar K.V

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87hay1akor.fsf@linux.vnet.ibm.com \
    --to=aneesh.kumar@linux.vnet.ibm.com \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=david@gibson.dropbear.id.au \
    --cc=dhillf@gmail.com \
    --cc=hannes@cmpxchg.org \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.