Re: [PATCH v10 7/7] mm: Don't split THP page when syscall is called

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Minchan Kim <minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
To: "Kirill A. Shutemov" <kirill-oKw7cIdHH8eLwutG50LtGA@public.gmane.org>
Cc: Andrew Morton
	<akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org,
	Michael Kerrisk
	<mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,
	Linux API <linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
	Hugh Dickins <hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>,
	Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>,
	Rik van Riel <riel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,
	KOSAKI Motohiro
	<kosaki.motohiro-+CUm20s59erQFUHtdCDX3A@public.gmane.org>,
	Mel Gorman <mgorman-l3A5Bk7waGM@public.gmane.org>,
	Jason Evans <je-b10kYP2dOMg@public.gmane.org>,
	Zhang Yanfei
	<zhangyanfei-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
Subject: Re: [PATCH v10 7/7] mm: Don't split THP page when syscall is called
Date: Tue, 8 Jul 2014 10:30:38 +0900	[thread overview]
Message-ID: <20140708013038.GD6076@bbox> (raw)
In-Reply-To: <20140707111303.GC23150-nhfs4B5ZimeFUdmeq17FyvUpdFzICT1y@public.gmane.org>

On Mon, Jul 07, 2014 at 02:13:03PM +0300, Kirill A. Shutemov wrote:
> On Mon, Jul 07, 2014 at 09:53:58AM +0900, Minchan Kim wrote:
> > We don't need to split THP page when MADV_FREE syscall is
> > called. It could be done when VM decide really frees it so
> > we could reduce the number of THP split.
> > 
> > Signed-off-by: Minchan Kim <minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> > ---
> >  include/linux/huge_mm.h |  3 +++
> >  mm/huge_memory.c        | 25 +++++++++++++++++++++++++
> >  mm/madvise.c            | 19 +++++++++++++++++--
> >  mm/rmap.c               |  4 ++++
> >  mm/vmscan.c             | 24 ++++++++++++++++--------
> >  5 files changed, 65 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 63579cb8d3dc..f0d37238cf8f 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
> >  					  unsigned long addr,
> >  					  pmd_t *pmd,
> >  					  unsigned int flags);
> > +extern int madvise_free_pmd(struct mmu_gather *tlb,
> > +			struct vm_area_struct *vma,
> > +			pmd_t *pmd, unsigned long addr);
> >  extern int zap_huge_pmd(struct mmu_gather *tlb,
> >  			struct vm_area_struct *vma,
> >  			pmd_t *pmd, unsigned long addr);
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 5d562a9fe931..2a70069dcfc0 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1384,6 +1384,31 @@ out:
> >  	return 0;
> >  }
> >  
> > +int madvise_free_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > +		 pmd_t *pmd, unsigned long addr)
> > +{
> > +	spinlock_t *ptl;
> > +	int ret = 0;
> > +
> > +	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
> > +		pmd_t orig_pmd;
> > +		struct mm_struct *mm = vma->vm_mm;
> > +
> > +		/* No hugepage in swapcache */
> > +		VM_BUG_ON(PageSwapCache(pmd_page(orig_pmd)));
> 
> VM_BUG_ON_PAGE() ?

NP.

> 
> > +
> > +		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
> > +		orig_pmd = pmd_mkold(orig_pmd);
> > +		orig_pmd = pmd_mkclean(orig_pmd);
> > +
> > +		set_pmd_at(mm, addr, pmd, orig_pmd);
> > +		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> > +		spin_unlock(ptl);
> > +		ret = 1;
> > +	}
> > +	return ret;
> > +}
> > +
> >  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  		 pmd_t *pmd, unsigned long addr)
> >  {
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 372a25a8ea82..3c99919ee094 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -320,8 +320,23 @@ static inline unsigned long madvise_free_pmd_range(struct mmu_gather *tlb,
> >  		 * if the range covers.
> >  		 */
> >  		next = pmd_addr_end(addr, end);
> > -		if (pmd_trans_huge(*pmd))
> > -			split_huge_page_pmd(vma, addr, pmd);
> > +		if (pmd_trans_huge(*pmd)) {
> > +			if (next - addr != HPAGE_PMD_SIZE) {
> > +#ifdef CONFIG_DEBUG_VM
> > +				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
> > +					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
> > +						__func__, addr, end,
> > +						vma->vm_start,
> > +						vma->vm_end);
> > +					BUG();
> > +				}
> > +#endif
> > +				split_huge_page_pmd(vma, addr, pmd);
> > +			} else if (madvise_free_pmd(tlb, vma, pmd, addr))
> > +				goto next;
> > +			/* fall through */
> > +		}
> > +
> >  		/*
> >  		 * Here there can be other concurrent MADV_DONTNEED or
> >  		 * trans huge page faults running, and if the pmd is
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index ee495d84c8b3..3c415eb8b6f0 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -702,6 +702,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
> >  		/* go ahead even if the pmd is pmd_trans_splitting() */
> >  		if (pmdp_clear_flush_young_notify(vma, address, pmd))
> >  			referenced++;
> > +
> > +		if (pmd_dirty(*pmd))
> > +			dirty++;
> > +
> >  		spin_unlock(ptl);
> >  	} else {
> >  		pte_t *pte;
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index f7a45600846f..4e15babf4414 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -971,15 +971,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> >  		 * Anonymous process memory has backing store?
> >  		 * Try to allocate it some swap space here.
> >  		 */
> > -		if (PageAnon(page) && !PageSwapCache(page) && !freeable) {
> > -			if (!(sc->gfp_mask & __GFP_IO))
> > -				goto keep_locked;
> > -			if (!add_to_swap(page, page_list))
> > -				goto activate_locked;
> > -			may_enter_fs = 1;
> > +		if (PageAnon(page) && !PageSwapCache(page)) {
> > +			if (!freeable) {
> > +				if (!(sc->gfp_mask & __GFP_IO))
> > +					goto keep_locked;
> > +				if (!add_to_swap(page, page_list))
> > +					goto activate_locked;
> > +				may_enter_fs = 1;
> >  
> > -			/* Adding to swap updated mapping */
> > -			mapping = page_mapping(page);
> > +				/* Adding to swap updated mapping */
> > +				mapping = page_mapping(page);
> > +			} else {
> > +				if (unlikely(PageTransHuge(page))) {
> > +					if (unlikely(split_huge_page_to_list(
> > +						page, page_list)))
> > +						goto keep_locked;
> 
> Hm. It would be better to free the huge page without splitting. 
> It shouldn't be a big deal: walk over rmap and zap all pmds.
> Or I miss something?

Actually, I tried that and found no problem except with CONFIG_DEBUG_VM, but
I rolled it back after looking at [1].
Now that I have read the description in detail, prompted by your review, I
think we can remove BUG_ON(PageTransHuge(page)) in try_to_unmap and go without
splitting for lazyfree pages: they are no longer in the swapcache, so the
assumption of [1] is not valid for them. I will do that in the next revision.

Thanks for the review, Kirill!

[1] thp: split_huge_page paging, 3f04f62f9


> 
> > +				}
> > +			}
> >  		}
> >  
> >  		/*
> > -- 
> > 2.0.0
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-api" in
> > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> -- 
>  Kirill A. Shutemov
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo-Bw31MaZKKs0EbZ0PF+XxCw@public.gmane.org  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org"> email-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org </a>

-- 
Kind regards,
Minchan Kim

WARNING: multiple messages have this Message-ID (diff)

From: Minchan Kim <minchan@kernel.org>
To: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Linux API <linux-api@vger.kernel.org>,
	Hugh Dickins <hughd@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Rik van Riel <riel@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Mel Gorman <mgorman@suse.de>, Jason Evans <je@fb.com>,
	Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Subject: Re: [PATCH v10 7/7] mm: Don't split THP page when syscall is called
Date: Tue, 8 Jul 2014 10:30:38 +0900	[thread overview]
Message-ID: <20140708013038.GD6076@bbox> (raw)
In-Reply-To: <20140707111303.GC23150@node.dhcp.inet.fi>

On Mon, Jul 07, 2014 at 02:13:03PM +0300, Kirill A. Shutemov wrote:
> On Mon, Jul 07, 2014 at 09:53:58AM +0900, Minchan Kim wrote:
> > We don't need to split THP page when MADV_FREE syscall is
> > called. It could be done when VM decide really frees it so
> > we could reduce the number of THP split.
> > 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >  include/linux/huge_mm.h |  3 +++
> >  mm/huge_memory.c        | 25 +++++++++++++++++++++++++
> >  mm/madvise.c            | 19 +++++++++++++++++--
> >  mm/rmap.c               |  4 ++++
> >  mm/vmscan.c             | 24 ++++++++++++++++--------
> >  5 files changed, 65 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 63579cb8d3dc..f0d37238cf8f 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
> >  					  unsigned long addr,
> >  					  pmd_t *pmd,
> >  					  unsigned int flags);
> > +extern int madvise_free_pmd(struct mmu_gather *tlb,
> > +			struct vm_area_struct *vma,
> > +			pmd_t *pmd, unsigned long addr);
> >  extern int zap_huge_pmd(struct mmu_gather *tlb,
> >  			struct vm_area_struct *vma,
> >  			pmd_t *pmd, unsigned long addr);
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 5d562a9fe931..2a70069dcfc0 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1384,6 +1384,31 @@ out:
> >  	return 0;
> >  }
> >  
> > +int madvise_free_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > +		 pmd_t *pmd, unsigned long addr)
> > +{
> > +	spinlock_t *ptl;
> > +	int ret = 0;
> > +
> > +	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
> > +		pmd_t orig_pmd;
> > +		struct mm_struct *mm = vma->vm_mm;
> > +
> > +		/* No hugepage in swapcache */
> > +		VM_BUG_ON(PageSwapCache(pmd_page(orig_pmd)));
> 
> VM_BUG_ON_PAGE() ?

NP.

> 
> > +
> > +		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
> > +		orig_pmd = pmd_mkold(orig_pmd);
> > +		orig_pmd = pmd_mkclean(orig_pmd);
> > +
> > +		set_pmd_at(mm, addr, pmd, orig_pmd);
> > +		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> > +		spin_unlock(ptl);
> > +		ret = 1;
> > +	}
> > +	return ret;
> > +}
> > +
> >  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  		 pmd_t *pmd, unsigned long addr)
> >  {
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 372a25a8ea82..3c99919ee094 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -320,8 +320,23 @@ static inline unsigned long madvise_free_pmd_range(struct mmu_gather *tlb,
> >  		 * if the range covers.
> >  		 */
> >  		next = pmd_addr_end(addr, end);
> > -		if (pmd_trans_huge(*pmd))
> > -			split_huge_page_pmd(vma, addr, pmd);
> > +		if (pmd_trans_huge(*pmd)) {
> > +			if (next - addr != HPAGE_PMD_SIZE) {
> > +#ifdef CONFIG_DEBUG_VM
> > +				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
> > +					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
> > +						__func__, addr, end,
> > +						vma->vm_start,
> > +						vma->vm_end);
> > +					BUG();
> > +				}
> > +#endif
> > +				split_huge_page_pmd(vma, addr, pmd);
> > +			} else if (madvise_free_pmd(tlb, vma, pmd, addr))
> > +				goto next;
> > +			/* fall through */
> > +		}
> > +
> >  		/*
> >  		 * Here there can be other concurrent MADV_DONTNEED or
> >  		 * trans huge page faults running, and if the pmd is
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index ee495d84c8b3..3c415eb8b6f0 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -702,6 +702,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
> >  		/* go ahead even if the pmd is pmd_trans_splitting() */
> >  		if (pmdp_clear_flush_young_notify(vma, address, pmd))
> >  			referenced++;
> > +
> > +		if (pmd_dirty(*pmd))
> > +			dirty++;
> > +
> >  		spin_unlock(ptl);
> >  	} else {
> >  		pte_t *pte;
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index f7a45600846f..4e15babf4414 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -971,15 +971,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> >  		 * Anonymous process memory has backing store?
> >  		 * Try to allocate it some swap space here.
> >  		 */
> > -		if (PageAnon(page) && !PageSwapCache(page) && !freeable) {
> > -			if (!(sc->gfp_mask & __GFP_IO))
> > -				goto keep_locked;
> > -			if (!add_to_swap(page, page_list))
> > -				goto activate_locked;
> > -			may_enter_fs = 1;
> > +		if (PageAnon(page) && !PageSwapCache(page)) {
> > +			if (!freeable) {
> > +				if (!(sc->gfp_mask & __GFP_IO))
> > +					goto keep_locked;
> > +				if (!add_to_swap(page, page_list))
> > +					goto activate_locked;
> > +				may_enter_fs = 1;
> >  
> > -			/* Adding to swap updated mapping */
> > -			mapping = page_mapping(page);
> > +				/* Adding to swap updated mapping */
> > +				mapping = page_mapping(page);
> > +			} else {
> > +				if (unlikely(PageTransHuge(page))) {
> > +					if (unlikely(split_huge_page_to_list(
> > +						page, page_list)))
> > +						goto keep_locked;
> 
> Hm. It would be better to free the huge page without splitting. 
> It shouldn't be a big deal: walk over rmap and zap all pmds.
> Or I miss something?

Actually, I tried that and found no problem except with CONFIG_DEBUG_VM, but
I rolled it back after looking at [1].
Now that I have read the description in detail, prompted by your review, I
think we can remove BUG_ON(PageTransHuge(page)) in try_to_unmap and go without
splitting for lazyfree pages: they are no longer in the swapcache, so the
assumption of [1] is not valid for them. I will do that in the next revision.

Thanks for the review, Kirill!

[1] thp: split_huge_page paging, 3f04f62f9


> 
> > +				}
> > +			}
> >  		}
> >  
> >  		/*
> > -- 
> > 2.0.0
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-api" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> -- 
>  Kirill A. Shutemov
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)

From: Minchan Kim <minchan@kernel.org>
To: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Linux API <linux-api@vger.kernel.org>,
	Hugh Dickins <hughd@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Rik van Riel <riel@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Mel Gorman <mgorman@suse.de>, Jason Evans <je@fb.com>,
	Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Subject: Re: [PATCH v10 7/7] mm: Don't split THP page when syscall is called
Date: Tue, 8 Jul 2014 10:30:38 +0900	[thread overview]
Message-ID: <20140708013038.GD6076@bbox> (raw)
In-Reply-To: <20140707111303.GC23150@node.dhcp.inet.fi>

On Mon, Jul 07, 2014 at 02:13:03PM +0300, Kirill A. Shutemov wrote:
> On Mon, Jul 07, 2014 at 09:53:58AM +0900, Minchan Kim wrote:
> > We don't need to split THP page when MADV_FREE syscall is
> > called. It could be done when VM decide really frees it so
> > we could reduce the number of THP split.
> > 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >  include/linux/huge_mm.h |  3 +++
> >  mm/huge_memory.c        | 25 +++++++++++++++++++++++++
> >  mm/madvise.c            | 19 +++++++++++++++++--
> >  mm/rmap.c               |  4 ++++
> >  mm/vmscan.c             | 24 ++++++++++++++++--------
> >  5 files changed, 65 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 63579cb8d3dc..f0d37238cf8f 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
> >  					  unsigned long addr,
> >  					  pmd_t *pmd,
> >  					  unsigned int flags);
> > +extern int madvise_free_pmd(struct mmu_gather *tlb,
> > +			struct vm_area_struct *vma,
> > +			pmd_t *pmd, unsigned long addr);
> >  extern int zap_huge_pmd(struct mmu_gather *tlb,
> >  			struct vm_area_struct *vma,
> >  			pmd_t *pmd, unsigned long addr);
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 5d562a9fe931..2a70069dcfc0 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1384,6 +1384,31 @@ out:
> >  	return 0;
> >  }
> >  
> > +int madvise_free_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > +		 pmd_t *pmd, unsigned long addr)
> > +{
> > +	spinlock_t *ptl;
> > +	int ret = 0;
> > +
> > +	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
> > +		pmd_t orig_pmd;
> > +		struct mm_struct *mm = vma->vm_mm;
> > +
> > +		/* No hugepage in swapcache */
> > +		VM_BUG_ON(PageSwapCache(pmd_page(orig_pmd)));
> 
> VM_BUG_ON_PAGE() ?

NP.

> 
> > +
> > +		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
> > +		orig_pmd = pmd_mkold(orig_pmd);
> > +		orig_pmd = pmd_mkclean(orig_pmd);
> > +
> > +		set_pmd_at(mm, addr, pmd, orig_pmd);
> > +		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
> > +		spin_unlock(ptl);
> > +		ret = 1;
> > +	}
> > +	return ret;
> > +}
> > +
> >  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  		 pmd_t *pmd, unsigned long addr)
> >  {
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 372a25a8ea82..3c99919ee094 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -320,8 +320,23 @@ static inline unsigned long madvise_free_pmd_range(struct mmu_gather *tlb,
> >  		 * if the range covers.
> >  		 */
> >  		next = pmd_addr_end(addr, end);
> > -		if (pmd_trans_huge(*pmd))
> > -			split_huge_page_pmd(vma, addr, pmd);
> > +		if (pmd_trans_huge(*pmd)) {
> > +			if (next - addr != HPAGE_PMD_SIZE) {
> > +#ifdef CONFIG_DEBUG_VM
> > +				if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
> > +					pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
> > +						__func__, addr, end,
> > +						vma->vm_start,
> > +						vma->vm_end);
> > +					BUG();
> > +				}
> > +#endif
> > +				split_huge_page_pmd(vma, addr, pmd);
> > +			} else if (madvise_free_pmd(tlb, vma, pmd, addr))
> > +				goto next;
> > +			/* fall through */
> > +		}
> > +
> >  		/*
> >  		 * Here there can be other concurrent MADV_DONTNEED or
> >  		 * trans huge page faults running, and if the pmd is
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index ee495d84c8b3..3c415eb8b6f0 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -702,6 +702,10 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
> >  		/* go ahead even if the pmd is pmd_trans_splitting() */
> >  		if (pmdp_clear_flush_young_notify(vma, address, pmd))
> >  			referenced++;
> > +
> > +		if (pmd_dirty(*pmd))
> > +			dirty++;
> > +
> >  		spin_unlock(ptl);
> >  	} else {
> >  		pte_t *pte;
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index f7a45600846f..4e15babf4414 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -971,15 +971,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
> >  		 * Anonymous process memory has backing store?
> >  		 * Try to allocate it some swap space here.
> >  		 */
> > -		if (PageAnon(page) && !PageSwapCache(page) && !freeable) {
> > -			if (!(sc->gfp_mask & __GFP_IO))
> > -				goto keep_locked;
> > -			if (!add_to_swap(page, page_list))
> > -				goto activate_locked;
> > -			may_enter_fs = 1;
> > +		if (PageAnon(page) && !PageSwapCache(page)) {
> > +			if (!freeable) {
> > +				if (!(sc->gfp_mask & __GFP_IO))
> > +					goto keep_locked;
> > +				if (!add_to_swap(page, page_list))
> > +					goto activate_locked;
> > +				may_enter_fs = 1;
> >  
> > -			/* Adding to swap updated mapping */
> > -			mapping = page_mapping(page);
> > +				/* Adding to swap updated mapping */
> > +				mapping = page_mapping(page);
> > +			} else {
> > +				if (unlikely(PageTransHuge(page))) {
> > +					if (unlikely(split_huge_page_to_list(
> > +						page, page_list)))
> > +						goto keep_locked;
> 
> Hm. It would be better to free the huge page without splitting. 
> It shouldn't be a big deal: walk over rmap and zap all pmds.
> Or I miss something?

Actually, I tried that and found no problem except with CONFIG_DEBUG_VM, but
I rolled it back after looking at [1].
Now that I have read the description in detail, prompted by your review, I
think we can remove BUG_ON(PageTransHuge(page)) in try_to_unmap and go without
splitting for lazyfree pages: they are no longer in the swapcache, so the
assumption of [1] is not valid for them. I will do that in the next revision.

Thanks for the review, Kirill!

[1] thp: split_huge_page paging, 3f04f62f9


> 
> > +				}
> > +			}
> >  		}
> >  
> >  		/*
> > -- 
> > 2.0.0
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-api" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> -- 
>  Kirill A. Shutemov
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Kind regards,
Minchan Kim

next prev parent reply	other threads:[~2014-07-08  1:30 UTC|newest]

Thread overview: 55+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-07-07  0:53 [PATCH v10 0/7] MADV_FREE support Minchan Kim
2014-07-07  0:53 ` Minchan Kim
2014-07-07  0:53 ` Minchan Kim
2014-07-07  0:53 ` [PATCH v10 1/7] mm: support madvise(MADV_FREE) Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07 10:41   ` Kirill A. Shutemov
2014-07-07 10:41     ` Kirill A. Shutemov
2014-07-08  0:36     ` Minchan Kim
2014-07-08  0:36       ` Minchan Kim
     [not found]   ` <1404694438-10272-2-git-send-email-minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2014-07-08  3:54     ` Zhang Yanfei
2014-07-08  3:54       ` Zhang Yanfei
2014-07-08  3:54       ` Zhang Yanfei
     [not found]       ` <53BB6B64.1080807-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2014-07-08  4:45         ` Minchan Kim
2014-07-08  4:45           ` Minchan Kim
2014-07-08  4:45           ` Minchan Kim
2014-07-07  0:53 ` [PATCH v10 2/7] x86: add pmd_[dirty|mkclean] for THP Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07 10:44   ` Kirill A. Shutemov
2014-07-07 10:44     ` Kirill A. Shutemov
2014-07-07  0:53 ` [PATCH v10 3/7] sparc: " Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07  0:53 ` [PATCH v10 4/7] powerpc: " Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07  0:53 ` [PATCH v10 5/7] s390: " Minchan Kim
2014-07-07  0:53   ` Minchan Kim
     [not found]   ` <1404694438-10272-6-git-send-email-minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2014-07-07 15:31     ` Gerald Schaefer
2014-07-07 15:31       ` Gerald Schaefer
2014-07-07 15:31       ` Gerald Schaefer
2014-07-07  0:53 ` [PATCH v10 6/7] ARM: " Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07  0:53   ` Minchan Kim
     [not found]   ` <1404694438-10272-7-git-send-email-minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2014-07-07  9:12     ` Will Deacon
2014-07-07  9:12       ` Will Deacon
2014-07-07  9:12       ` Will Deacon
2014-07-07  9:12       ` Will Deacon
2014-07-07  9:22     ` Steve Capper
2014-07-07  9:22       ` Steve Capper
2014-07-07  9:22       ` Steve Capper
2014-07-07  9:22       ` Steve Capper
2014-07-08  1:09       ` Minchan Kim
2014-07-08  1:09         ` Minchan Kim
2014-07-07  0:53 ` [PATCH v10 7/7] mm: Don't split THP page when syscall is called Minchan Kim
2014-07-07  0:53   ` Minchan Kim
2014-07-07 11:13   ` Kirill A. Shutemov
2014-07-07 11:13     ` Kirill A. Shutemov
     [not found]     ` <20140707111303.GC23150-nhfs4B5ZimeFUdmeq17FyvUpdFzICT1y@public.gmane.org>
2014-07-08  1:30       ` Minchan Kim [this message]
2014-07-08  1:30         ` Minchan Kim
2014-07-08  1:30         ` Minchan Kim
2014-07-08  6:10         ` Minchan Kim
2014-07-08  6:10           ` Minchan Kim
2014-07-08  6:10           ` Minchan Kim
2014-07-08  9:48         ` Kirill A. Shutemov
2014-07-08  9:48           ` Kirill A. Shutemov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140708013038.GD6076@bbox \
    --to=minchan-dgejt+ai2ygdnm+yrofe0a@public.gmane.org \
    --cc=akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org \
    --cc=hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org \
    --cc=hughd-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org \
    --cc=je-b10kYP2dOMg@public.gmane.org \
    --cc=kirill-oKw7cIdHH8eLwutG50LtGA@public.gmane.org \
    --cc=kosaki.motohiro-+CUm20s59erQFUHtdCDX3A@public.gmane.org \
    --cc=linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org \
    --cc=mgorman-l3A5Bk7waGM@public.gmane.org \
    --cc=mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org \
    --cc=riel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=zhangyanfei-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.