All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrea Arcangeli <aarcange@redhat.com>
To: Johannes Weiner <jweiner@redhat.com>
Cc: linux-mm@kvack.org, Mel Gorman <mel@csn.ul.ie>,
	Rik van Riel <riel@redhat.com>
Subject: Re: [PATCH] thp: mremap support and TLB optimization
Date: Tue, 15 Mar 2011 11:01:07 +0100	[thread overview]
Message-ID: <20110315100107.GI10696@random.random> (raw)
In-Reply-To: <20110315092750.GD2140@redhat.com>

On Tue, Mar 15, 2011 at 10:27:50AM +0100, Johannes Weiner wrote:
> On Fri, Mar 11, 2011 at 03:04:10AM +0100, Andrea Arcangeli wrote:
> > @@ -42,7 +42,7 @@ static pmd_t *get_old_pmd(struct mm_stru
> >  
> >  	pmd = pmd_offset(pud, addr);
> >  	split_huge_page_pmd(mm, pmd);
> 
> Wasn't getting rid of this line the sole purpose of the patch? :)

Leftover that should have been deleted right...

> > +		if (pmd_trans_huge(*old_pmd)) {
> > +			int err = move_huge_pmd(vma, old_addr, new_addr,
> > +						old_end, old_pmd, new_pmd);
> > +			if (err > 0) {
> > +				old_addr += HPAGE_PMD_SIZE;
> > +				new_addr += HPAGE_PMD_SIZE;
> > +				continue;
> > +			}
> > +		}
> > +		/*
> > +		 * split_huge_page_pmd() must run outside the
> > +		 * pmd_trans_huge() block above because that check
> > +		 * is racy. split_huge_page_pmd() will recheck
> > +		 * pmd_trans_huge() but in a not racy way under the
> > +		 * page_table_lock.
> > +		 */
> > +		split_huge_page_pmd(vma->vm_mm, old_pmd);
> 
> I don't understand what we are racing here against.  If we see a huge
> pmd, it may split.  But we hold mmap_sem in write-mode, I don't see
> how a regular pmd could become huge all of a sudden at this point.

Agreed, in fact it runs it without the lock too...

Does this look any better? This also optimizes away the tlb flush for
totally uninitialized areas.

===
Subject: thp: mremap support and TLB optimization

From: Andrea Arcangeli <aarcange@redhat.com>

This adds THP support to mremap (decreases the number of split_huge_page
calls).

This also replaces ptep_clear_flush with ptep_get_and_clear and adds a final
flush_tlb_range, to send a single tlb flush IPI instead of one IPI for each
page.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 include/linux/huge_mm.h |    3 +++
 mm/huge_memory.c        |   38 ++++++++++++++++++++++++++++++++++++++
 mm/mremap.c             |   29 +++++++++++++++++++++--------
 3 files changed, 62 insertions(+), 8 deletions(-)

--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -22,6 +22,9 @@ extern int zap_huge_pmd(struct mmu_gathe
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
+extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+			 unsigned long new_addr, unsigned long old_end,
+			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, pgprot_t newprot);
 
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_stru
 		return NULL;
 
 	pmd = pmd_offset(pud, addr);
-	split_huge_page_pmd(mm, pmd);
-	if (pmd_none_or_clear_bad(pmd))
+	if (pmd_none(*pmd))
 		return NULL;
 
 	return pmd;
@@ -80,11 +79,7 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
-	unsigned long old_start;
 
-	old_start = old_addr;
-	mmu_notifier_invalidate_range_start(vma->vm_mm,
-					    old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -112,7 +107,7 @@ static void move_ptes(struct vm_area_str
 				   new_pte++, new_addr += PAGE_SIZE) {
 		if (pte_none(*old_pte))
 			continue;
-		pte = ptep_clear_flush(vma, old_addr, old_pte);
+		pte = ptep_get_and_clear(mm, old_addr, old_pte);
 		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
@@ -124,7 +119,6 @@ static void move_ptes(struct vm_area_str
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
-	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
@@ -135,10 +129,13 @@ unsigned long move_page_tables(struct vm
 {
 	unsigned long extent, next, old_end;
 	pmd_t *old_pmd, *new_pmd;
+	bool need_flush = false;
 
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
+	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+
 	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
 		cond_resched();
 		next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -151,6 +148,18 @@ unsigned long move_page_tables(struct vm
 		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
 		if (!new_pmd)
 			break;
+		need_flush = true;
+		if (pmd_trans_huge(*old_pmd)) {
+			int err = move_huge_pmd(vma, old_addr, new_addr,
+						old_end, old_pmd, new_pmd);
+			if (err > 0) {
+				old_addr += HPAGE_PMD_SIZE;
+				new_addr += HPAGE_PMD_SIZE;
+				continue;
+			} else if (!err)
+				__split_huge_page_pmd(vma->vm_mm, old_pmd);
+			VM_BUG_ON(pmd_trans_huge(*old_pmd));
+		}
 		next = (new_addr + PMD_SIZE) & PMD_MASK;
 		if (extent > next - new_addr)
 			extent = next - new_addr;
@@ -159,6 +168,10 @@ unsigned long move_page_tables(struct vm
 		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
 				new_vma, new_pmd, new_addr);
 	}
+	if (likely(need_flush))
+		flush_tlb_range(vma, old_end-len, old_addr);
+
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
 
 	return len + old_addr - old_end;	/* how much done */
 }
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1048,6 +1048,44 @@ int mincore_huge_pmd(struct vm_area_stru
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    (old_addr + HPAGE_PMD_SIZE) > old_end)
+		goto out;
+
+	/* if the new area is all for our destination it must be unmapped */
+	VM_BUG_ON(!pmd_none(*new_pmd));
+	/* mostly to remember this locking isn't enough with filebacked vma */
+	VM_BUG_ON(vma->vm_file);
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&vma->vm_mm->page_table_lock);
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+			ret = -1;
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else
+		spin_unlock(&mm->page_table_lock);
+
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot)
 {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2011-03-15 10:01 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-03-11  2:04 [PATCH] thp: mremap support and TLB optimization Andrea Arcangeli
2011-03-11 15:16 ` Rik van Riel
2011-03-11 19:44 ` Hugh Dickins
2011-03-11 20:25   ` Hugh Dickins
2011-03-12  4:28     ` Andrea Arcangeli
2011-03-12  4:02   ` Andrea Arcangeli
2011-03-15  9:27 ` Johannes Weiner
2011-03-15 10:01   ` Andrea Arcangeli [this message]
2011-03-15 12:07     ` Johannes Weiner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110315100107.GI10696@random.random \
    --to=aarcange@redhat.com \
    --cc=jweiner@redhat.com \
    --cc=linux-mm@kvack.org \
    --cc=mel@csn.ul.ie \
    --cc=riel@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.