* [PATCH v3] mm: incorporate read-only pages into transparent huge pages
@ 2015-01-27 17:39 Ebru Akagunduz
  2015-01-28  0:27 ` Andrea Arcangeli
  2015-01-28 13:57 ` Zhang Yanfei
  0 siblings, 2 replies; 5+ messages in thread
From: Ebru Akagunduz @ 2015-01-27 17:39 UTC (permalink / raw)
  To: linux-mm
  Cc: akpm, kirill, mhocko, mgorman, rientjes, sasha.levin, hughd,
	hannes, vbabka, linux-kernel, riel, aarcange, zhangyanfei.linux,
	Ebru Akagunduz
This patch aims to improve THP collapse rates, by allowing
THP collapse in the presence of read-only ptes, like those
left in place by do_swap_page after a read fault.
Currently THP can collapse 4kB pages into a THP when
there are up to khugepaged_max_ptes_none pte_none ptes
in a 2MB range. This patch applies the same limit for
read-only ptes.
The patch was tested with a test program that allocates
800MB of memory, writes to it, and then sleeps. I force
the system to swap out all but 190MB of the program by
touching other memory. Afterwards, the test program does
a mix of reads and writes to its memory, and the memory
gets swapped back in.
Without the patch, only the memory that did not get
swapped out remained in THPs, which corresponds to 24% of
the memory of the program. The percentage did not increase
over time.
With this patch, after 5 minutes of waiting khugepaged had
collapsed 50% of the program's memory back into THPs.
Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
Changes in v2:
 - Remove extra code indent (Vlastimil Babka)
 - Add comment line for check condition of page_count() (Vlastimil Babka)
 - Add fast path optimistic check to
   __collapse_huge_page_isolate() (Andrea Arcangeli)
 - Move the page_count() check below trylock_page() (Andrea Arcangeli)
Changes in v3:
 - Add an at-least-one-writable-pte check (Zhang Yanfei)
 - Debug page count (Vlastimil Babka, Andrea Arcangeli)
 - Increase read-only pte counter if pte is none (Andrea Arcangeli)
I've written down test results:
With the patch:
After swapped out:
cat /proc/pid/smaps:
Anonymous:      100464 kB
AnonHugePages:  100352 kB
Swap:           699540 kB
Fraction:       99,88
cat /proc/meminfo:
AnonPages:      1754448 kB
AnonHugePages:  1716224 kB
Fraction:       97,82
After swapped in:
In a few seconds:
cat /proc/pid/smaps:
Anonymous:      800004 kB
AnonHugePages:  145408 kB
Swap:           0 kB
Fraction:       18,17
cat /proc/meminfo:
AnonPages:      2455016 kB
AnonHugePages:  1761280 kB
Fraction:       71,74
In 5 minutes:
cat /proc/pid/smaps
Anonymous:      800004 kB
AnonHugePages:  407552 kB
Swap:           0 kB
Fraction:       50,94
cat /proc/meminfo:
AnonPages:      2456872 kB
AnonHugePages:  2023424 kB
Fraction:       82,35
Without the patch:
After swapped out:
cat /proc/pid/smaps:
Anonymous:      190660 kB
AnonHugePages:  190464 kB
Swap:           609344 kB
Fraction:       99,89
cat /proc/meminfo:
AnonPages:      1740456 kB
AnonHugePages:  1667072 kB
Fraction:       95,78
After swapped in:
cat /proc/pid/smaps:
Anonymous:      800004 kB
AnonHugePages:  190464 kB
Swap:           0 kB
Fraction:       23,80
cat /proc/meminfo:
AnonPages:      2350032 kB
AnonHugePages:  1667072 kB
Fraction:       70,93
I waited 10 minutes; the fractions
did not change without the patch.
 mm/huge_memory.c | 60 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 11 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875..17d6e59 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2148,17 +2148,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
 	struct page *page;
 	pte_t *_pte;
-	int referenced = 0, none = 0;
+	int referenced = 0, none = 0, ro = 0, writable = 0;
 	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (pte_none(pteval)) {
+			ro++;
 			if (++none <= khugepaged_max_ptes_none)
 				continue;
 			else
 				goto out;
 		}
-		if (!pte_present(pteval) || !pte_write(pteval))
+		if (!pte_present(pteval))
 			goto out;
 		page = vm_normal_page(vma, address, pteval);
 		if (unlikely(!page))
@@ -2168,9 +2169,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
-		/* cannot use mapcount: can't collapse if there's a gup pin */
-		if (page_count(page) != 1)
-			goto out;
 		/*
 		 * We can do it before isolate_lru_page because the
 		 * page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2177,34 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		 */
 		if (!trylock_page(page))
 			goto out;
+
+		/*
+		 * cannot use mapcount: can't collapse if there's a gup pin.
+		 * The page must only be referenced by the scanned process
+		 * and page swap cache.
+		 */
+		if (page_count(page) != 1 + !!PageSwapCache(page)) {
+			unlock_page(page);
+			goto out;
+		}
+		if (!pte_write(pteval)) {
+			if (++ro > khugepaged_max_ptes_none) {
+				unlock_page(page);
+				goto out;
+			}
+			if (PageSwapCache(page) && !reuse_swap_page(page)) {
+				unlock_page(page);
+				goto out;
+			}
+			/*
+			 * Page is not in the swap cache, and page count is
+			 * one (see above). It can be collapsed into a THP.
+			 */
+			VM_BUG_ON(page_count(page) != 1);
+		} else {
+			writable = 1;
+		}
+
 		/*
 		 * Isolate the page to avoid collapsing an hugepage
 		 * currently in use by the VM.
@@ -2197,7 +2223,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
-	if (likely(referenced))
+	if (likely(referenced && writable))
 		return 1;
 out:
 	release_pte_pages(pte, _pte);
@@ -2550,7 +2576,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
-	int ret = 0, referenced = 0, none = 0;
+	int ret = 0, referenced = 0, none = 0, ro = 0, writable = 0;
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
@@ -2568,13 +2594,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (pte_none(pteval)) {
+			ro++;
 			if (++none <= khugepaged_max_ptes_none)
 				continue;
 			else
 				goto out_unmap;
 		}
-		if (!pte_present(pteval) || !pte_write(pteval))
+		if (!pte_present(pteval))
 			goto out_unmap;
+		if (!pte_write(pteval)) {
+			if (++ro > khugepaged_max_ptes_none)
+				goto out_unmap;
+		} else {
+			writable = 1;
+		}
+
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
@@ -2591,14 +2625,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		VM_BUG_ON_PAGE(PageCompound(page), page);
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
-		/* cannot use mapcount: can't collapse if there's a gup pin */
-		if (page_count(page) != 1)
+		/*
+		 * cannot use mapcount: can't collapse if there's a gup pin.
+		 * The page must only be referenced by the scanned process
+		 * and page swap cache.
+		 */
+		if (page_count(page) != 1 + !!PageSwapCache(page))
 			goto out_unmap;
 		if (pte_young(pteval) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
-	if (referenced)
+	if (referenced && writable)
 		ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-- 
1.9.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related	[flat|nested] 5+ messages in thread
- * Re: [PATCH v3] mm: incorporate read-only pages into transparent huge pages
  2015-01-27 17:39 [PATCH v3] mm: incorporate read-only pages into transparent huge pages Ebru Akagunduz
@ 2015-01-28  0:27 ` Andrea Arcangeli
  2015-01-28  9:13   ` Vlastimil Babka
  2015-01-28 13:51   ` Zhang Yanfei
  2015-01-28 13:57 ` Zhang Yanfei
  1 sibling, 2 replies; 5+ messages in thread
From: Andrea Arcangeli @ 2015-01-28  0:27 UTC (permalink / raw)
  To: Ebru Akagunduz
  Cc: linux-mm, akpm, kirill, mhocko, mgorman, rientjes, sasha.levin,
	hughd, hannes, vbabka, linux-kernel, riel, zhangyanfei.linux
On Tue, Jan 27, 2015 at 07:39:13PM +0200, Ebru Akagunduz wrote:
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 817a875..17d6e59 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2148,17 +2148,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  {
>  	struct page *page;
>  	pte_t *_pte;
> -	int referenced = 0, none = 0;
> +	int referenced = 0, none = 0, ro = 0, writable = 0;
So your "writable" addition is enough and simpler/better than "ro"
counting. Once "ro" is removed "writable" can actually start to make a
difference (at the moment it does not).
I'd suggest removing "ro".
The sysctl was there only to reduce the memory footprint, but
collapsing readonly swapcache won't reduce the memory footprint. So it
may have been handy before, but this new "writable" looks better now
and keeping both doesn't help (keeping "ro" around prevents "writable"
from making a difference).
> @@ -2179,6 +2177,34 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		 */
>  		if (!trylock_page(page))
>  			goto out;
> +
> +		/*
> +		 * cannot use mapcount: can't collapse if there's a gup pin.
> +		 * The page must only be referenced by the scanned process
> +		 * and page swap cache.
> +		 */
> +		if (page_count(page) != 1 + !!PageSwapCache(page)) {
> +			unlock_page(page);
> +			goto out;
> +		}
> +		if (!pte_write(pteval)) {
> +			if (++ro > khugepaged_max_ptes_none) {
> +				unlock_page(page);
> +				goto out;
> +			}
> +			if (PageSwapCache(page) && !reuse_swap_page(page)) {
> +				unlock_page(page);
> +				goto out;
> +			}
> +			/*
> +			 * Page is not in the swap cache, and page count is
> +			 * one (see above). It can be collapsed into a THP.
> +			 */
> +			VM_BUG_ON(page_count(page) != 1);
In an earlier email I commented on this suggestion you received during
previous code review: the VM_BUG_ON is not ok because it can generate
false positives.
It's perfectly ok if page_count is not 1 if the page is isolated by
another CPU (another cpu calling isolate_lru_page).
The page_count check there is to ensure there are no gup-pins, and
that is achieved during the check. The VM may still mangle the
page_count and it's ok (the page count taken by the VM running in
another CPU doesn't need to be transferred to the collapsed THP).
In short, the check "page_count(page) != 1 + !!PageSwapCache(page)"
doesn't imply that the page_count cannot change. It only means that
there was no gup-pin at the very time of the check. It also
means there was no other VM pin, but what we care about is only the
gup-pin. The VM LRU pin can still be taken after the check and it's
ok. The GUP pin cannot be taken because we stopped all gup, so we're
safe if the check passes.
So you can simply delete the VM_BUG_ON; the earlier code there was fine.
> +		} else {
> +			writable = 1;
> +		}
> +
I suggest making writable a bool, using writable = false to init
and writable = true above.
When a value can only be 0|1, bool is better (it can be cast and
takes the same memory as an int; it just allows the compiler to be
stricter, and it makes the code more self-explanatory).
> +			if (++ro > khugepaged_max_ptes_none)
> +				goto out_unmap;
As mentioned above, the ro counting can go, and we can keep only
your new writable addition.
Thanks,
Andrea
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 5+ messages in thread
- * Re: [PATCH v3] mm: incorporate read-only pages into transparent huge pages
  2015-01-28  0:27 ` Andrea Arcangeli
@ 2015-01-28  9:13   ` Vlastimil Babka
  2015-01-28 13:51   ` Zhang Yanfei
  1 sibling, 0 replies; 5+ messages in thread
From: Vlastimil Babka @ 2015-01-28  9:13 UTC (permalink / raw)
  To: Andrea Arcangeli, Ebru Akagunduz
  Cc: linux-mm, akpm, kirill, mhocko, mgorman, rientjes, sasha.levin,
	hughd, hannes, linux-kernel, riel, zhangyanfei.linux
On 01/28/2015 01:27 AM, Andrea Arcangeli wrote:
> On Tue, Jan 27, 2015 at 07:39:13PM +0200, Ebru Akagunduz wrote:
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 817a875..17d6e59 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2148,17 +2148,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>   {
>>   	struct page *page;
>>   	pte_t *_pte;
>> -	int referenced = 0, none = 0;
>> +	int referenced = 0, none = 0, ro = 0, writable = 0;
>
> So your "writable" addition is enough and simpler/better than "ro"
> counting. Once "ro" is removed "writable" can actually start to make a
> difference (at the moment it does not).
>
> I'd suggest to remove "ro".
>
> The sysctl was there only to reduce the memory footprint but
> collapsing readonly swapcache won't reduce the memory footprint. So it
> may have been handy before but this new "writable" looks better now
> and keeping both doesn't help (keeping "ro" around prevents "writable"
> to make a difference).
Agree.
>> @@ -2179,6 +2177,34 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>   		 */
>>   		if (!trylock_page(page))
>>   			goto out;
>> +
>> +		/*
>> +		 * cannot use mapcount: can't collapse if there's a gup pin.
>> +		 * The page must only be referenced by the scanned process
>> +		 * and page swap cache.
>> +		 */
>> +		if (page_count(page) != 1 + !!PageSwapCache(page)) {
>> +			unlock_page(page);
>> +			goto out;
>> +		}
>> +		if (!pte_write(pteval)) {
>> +			if (++ro > khugepaged_max_ptes_none) {
>> +				unlock_page(page);
>> +				goto out;
>> +			}
>> +			if (PageSwapCache(page) && !reuse_swap_page(page)) {
>> +				unlock_page(page);
>> +				goto out;
>> +			}
>> +			/*
>> +			 * Page is not in the swap cache, and page count is
>> +			 * one (see above). It can be collapsed into a THP.
>> +			 */
>> +			VM_BUG_ON(page_count(page) != 1);
>
> In an earlier email I commented on this suggestion you received during
> previous code review: the VM_BUG_ON is not ok because it can generate
> false positives.
>
> It's perfectly ok if page_count is not 1 if the page is isolated by
> another CPU (another cpu calling isolate_lru_page).
>
> The page_count check there is to ensure there are no gup-pins, and
> that is achieved during the check. The VM may still mangle the
> page_count and it's ok (the page count taken by the VM running in
> another CPU doesn't need to be transferred to the collapsed THP).
>
> In short, the check "page_count(page) != 1 + !!PageSwapCache(page)"
> doesn't imply that the page_count cannot change. It only means at any
> given time there was no gup-pin at the very time of the check. It also
> means there were no other VM pin, but what we care about is only the
> gup-pin. The VM LRU pin can still be taken after the check and it's
> ok. The GUP pin cannot be taken because we stopped all gup so we're
> safe if the check passes.
>
> So you can simply delete the VM_BUG_ON, the earlier code there, was fine.
There's still the comment that's IMHO misleading in light of your 
explanation:
/*
  * Page is not in the swap cache, and page count is
  * one (see above). It can be collapsed into a THP.
  */
Maybe just delete it too.
>
>> +		} else {
>> +			writable = 1;
>> +		}
>> +
>
> I suggest to make writable a bool and use writable = false to init,
> and writable = true above.
>
> When a value can only be 0|1 bool is better (it can be casted and
> takes the same memory as an int, it just allows the compiler to be
> more strict and the fact it makes the code more self explanatory).
While at it, "referenced" is also used only as a bool, so convert it to 
bool as well?
>> +			if (++ro > khugepaged_max_ptes_none)
>> +				goto out_unmap;
>
> As mentioned above the ro counting can go, and we can keep only
> your new writable addition, as mentioned above.
>
> Thanks,
> Andrea
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 5+ messages in thread
- * Re: [PATCH v3] mm: incorporate read-only pages into transparent huge pages
  2015-01-28  0:27 ` Andrea Arcangeli
  2015-01-28  9:13   ` Vlastimil Babka
@ 2015-01-28 13:51   ` Zhang Yanfei
  1 sibling, 0 replies; 5+ messages in thread
From: Zhang Yanfei @ 2015-01-28 13:51 UTC (permalink / raw)
  To: Andrea Arcangeli, Ebru Akagunduz
  Cc: linux-mm, akpm, kirill, mhocko, mgorman, rientjes, sasha.levin,
	hughd, hannes, vbabka, linux-kernel, riel, zhangyanfei.linux
Hello
在 2015/1/28 8:27, Andrea Arcangeli 写道:
> On Tue, Jan 27, 2015 at 07:39:13PM +0200, Ebru Akagunduz wrote:
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 817a875..17d6e59 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2148,17 +2148,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>  {
>>  	struct page *page;
>>  	pte_t *_pte;
>> -	int referenced = 0, none = 0;
>> +	int referenced = 0, none = 0, ro = 0, writable = 0;
> So your "writable" addition is enough and simpler/better than "ro"
> counting. Once "ro" is removed "writable" can actually start to make a
> difference (at the moment it does not).
>
> I'd suggest to remove "ro".
>
> The sysctl was there only to reduce the memory footprint but
> collapsing readonly swapcache won't reduce the memory footprint. So it
> may have been handy before but this new "writable" looks better now
> and keeping both doesn't help (keeping "ro" around prevents "writable"
> to make a difference).
Agreed.
>
>> @@ -2179,6 +2177,34 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>>  		 */
>>  		if (!trylock_page(page))
>>  			goto out;
>> +
>> +		/*
>> +		 * cannot use mapcount: can't collapse if there's a gup pin.
>> +		 * The page must only be referenced by the scanned process
>> +		 * and page swap cache.
>> +		 */
>> +		if (page_count(page) != 1 + !!PageSwapCache(page)) {
>> +			unlock_page(page);
>> +			goto out;
>> +		}
>> +		if (!pte_write(pteval)) {
>> +			if (++ro > khugepaged_max_ptes_none) {
>> +				unlock_page(page);
>> +				goto out;
>> +			}
>> +			if (PageSwapCache(page) && !reuse_swap_page(page)) {
>> +				unlock_page(page);
>> +				goto out;
>> +			}
>> +			/*
>> +			 * Page is not in the swap cache, and page count is
>> +			 * one (see above). It can be collapsed into a THP.
>> +			 */
>> +			VM_BUG_ON(page_count(page) != 1);
> In an earlier email I commented on this suggestion you received during
> previous code review: the VM_BUG_ON is not ok because it can generate
> false positives.
>
> It's perfectly ok if page_count is not 1 if the page is isolated by
> another CPU (another cpu calling isolate_lru_page).
>
> The page_count check there is to ensure there are no gup-pins, and
> that is achieved during the check. The VM may still mangle the
> page_count and it's ok (the page count taken by the VM running in
> another CPU doesn't need to be transferred to the collapsed THP).
>
> In short, the check "page_count(page) != 1 + !!PageSwapCache(page)"
> doesn't imply that the page_count cannot change. It only means at any
> given time there was no gup-pin at the very time of the check. It also
> means there were no other VM pin, but what we care about is only the
> gup-pin. The VM LRU pin can still be taken after the check and it's
> ok. The GUP pin cannot be taken because we stopped all gup so we're
> safe if the check passes.
>
> So you can simply delete the VM_BUG_ON, the earlier code there, was fine.
So IMO, the comment should also be removed or changed as it may
mislead someone again later.
Thanks
Zhang
>
>> +		} else {
>> +			writable = 1;
>> +		}
>> +
> I suggest to make writable a bool and use writable = false to init,
> and writable = true above.
>
> When a value can only be 0|1 bool is better (it can be casted and
> takes the same memory as an int, it just allows the compiler to be
> more strict and the fact it makes the code more self explanatory).
>
>> +			if (++ro > khugepaged_max_ptes_none)
>> +				goto out_unmap;
> As mentioned above the ro counting can go, and we can keep only
> your new writable addition, as mentioned above.
>
> Thanks,
> Andrea
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 5+ messages in thread
 
- * Re: [PATCH v3] mm: incorporate read-only pages into transparent huge pages
  2015-01-27 17:39 [PATCH v3] mm: incorporate read-only pages into transparent huge pages Ebru Akagunduz
  2015-01-28  0:27 ` Andrea Arcangeli
@ 2015-01-28 13:57 ` Zhang Yanfei
  1 sibling, 0 replies; 5+ messages in thread
From: Zhang Yanfei @ 2015-01-28 13:57 UTC (permalink / raw)
  To: Ebru Akagunduz, linux-mm
  Cc: akpm, kirill, mhocko, mgorman, rientjes, sasha.levin, hughd,
	hannes, vbabka, linux-kernel, riel, aarcange, zhangyanfei.linux
Hello
在 2015/1/28 1:39, Ebru Akagunduz 写道:
> This patch aims to improve THP collapse rates, by allowing
> THP collapse in the presence of read-only ptes, like those
> left in place by do_swap_page after a read fault.
>
> Currently THP can collapse 4kB pages into a THP when
> there are up to khugepaged_max_ptes_none pte_none ptes
> in a 2MB range. This patch applies the same limit for
> read-only ptes.
>
> The patch was tested with a test program that allocates
> 800MB of memory, writes to it, and then sleeps. I force
> the system to swap out all but 190MB of the program by
> touching other memory. Afterwards, the test program does
> a mix of reads and writes to its memory, and the memory
> gets swapped back in.
>
> Without the patch, only the memory that did not get
> swapped out remained in THPs, which corresponds to 24% of
> the memory of the program. The percentage did not increase
> over time.
>
> With this patch, after 5 minutes of waiting khugepaged had
> collapsed 50% of the program's memory back into THPs.
>
> Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
> Reviewed-by: Rik van Riel <riel@redhat.com>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
Please feel free to add:
Acked-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
> ---
> Changes in v2:
>  - Remove extra code indent (Vlastimil Babka)
>  - Add comment line for check condition of page_count() (Vlastimil Babka)
>  - Add fast path optimistic check to
>    __collapse_huge_page_isolate() (Andrea Arcangeli)
>  - Move check condition of page_count() below to trylock_page() (Andrea Arcangeli)
>
> Changes in v3:
>  - Add a at-least-one-writable-pte check (Zhang Yanfei)
>  - Debug page count (Vlastimil Babka, Andrea Arcangeli)
>  - Increase read-only pte counter if pte is none (Andrea Arcangeli)
>
> I've written down test results:
> With the patch:
> After swapped out:
> cat /proc/pid/smaps:
> Anonymous:      100464 kB
> AnonHugePages:  100352 kB
> Swap:           699540 kB
> Fraction:       99,88
>
> cat /proc/meminfo:
> AnonPages:      1754448 kB
> AnonHugePages:  1716224 kB
> Fraction:       97,82
>
> After swapped in:
> In a few seconds:
> cat /proc/pid/smaps:
> Anonymous:      800004 kB
> AnonHugePages:  145408 kB
> Swap:           0 kB
> Fraction:       18,17
>
> cat /proc/meminfo:
> AnonPages:      2455016 kB
> AnonHugePages:  1761280 kB
> Fraction:       71,74
>
> In 5 minutes:
> cat /proc/pid/smaps
> Anonymous:      800004 kB
> AnonHugePages:  407552 kB
> Swap:           0 kB
> Fraction:       50,94
>
> cat /proc/meminfo:
> AnonPages:      2456872 kB
> AnonHugePages:  2023424 kB
> Fraction:       82,35
>
> Without the patch:
> After swapped out:
> cat /proc/pid/smaps:
> Anonymous:      190660 kB
> AnonHugePages:  190464 kB
> Swap:           609344 kB
> Fraction:       99,89
>
> cat /proc/meminfo:
> AnonPages:      1740456 kB
> AnonHugePages:  1667072 kB
> Fraction:       95,78
>
> After swapped in:
> cat /proc/pid/smaps:
> Anonymous:      800004 kB
> AnonHugePages:  190464 kB
> Swap:           0 kB
> Fraction:       23,80
>
> cat /proc/meminfo:
> AnonPages:      2350032 kB
> AnonHugePages:  1667072 kB
> Fraction:       70,93
>
> I waited 10 minutes the fractions
> did not change without the patch.
>
>  mm/huge_memory.c | 60 +++++++++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 49 insertions(+), 11 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 817a875..17d6e59 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2148,17 +2148,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  {
>  	struct page *page;
>  	pte_t *_pte;
> -	int referenced = 0, none = 0;
> +	int referenced = 0, none = 0, ro = 0, writable = 0;
>  	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
>  	     _pte++, address += PAGE_SIZE) {
>  		pte_t pteval = *_pte;
>  		if (pte_none(pteval)) {
> +			ro++;
>  			if (++none <= khugepaged_max_ptes_none)
>  				continue;
>  			else
>  				goto out;
>  		}
> -		if (!pte_present(pteval) || !pte_write(pteval))
> +		if (!pte_present(pteval))
>  			goto out;
>  		page = vm_normal_page(vma, address, pteval);
>  		if (unlikely(!page))
> @@ -2168,9 +2169,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		VM_BUG_ON_PAGE(!PageAnon(page), page);
>  		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
>  
> -		/* cannot use mapcount: can't collapse if there's a gup pin */
> -		if (page_count(page) != 1)
> -			goto out;
>  		/*
>  		 * We can do it before isolate_lru_page because the
>  		 * page can't be freed from under us. NOTE: PG_lock
> @@ -2179,6 +2177,34 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		 */
>  		if (!trylock_page(page))
>  			goto out;
> +
> +		/*
> +		 * cannot use mapcount: can't collapse if there's a gup pin.
> +		 * The page must only be referenced by the scanned process
> +		 * and page swap cache.
> +		 */
> +		if (page_count(page) != 1 + !!PageSwapCache(page)) {
> +			unlock_page(page);
> +			goto out;
> +		}
> +		if (!pte_write(pteval)) {
> +			if (++ro > khugepaged_max_ptes_none) {
> +				unlock_page(page);
> +				goto out;
> +			}
> +			if (PageSwapCache(page) && !reuse_swap_page(page)) {
> +				unlock_page(page);
> +				goto out;
> +			}
> +			/*
> +			 * Page is not in the swap cache, and page count is
> +			 * one (see above). It can be collapsed into a THP.
> +			 */
> +			VM_BUG_ON(page_count(page) != 1);
> +		} else {
> +			writable = 1;
> +		}
> +
>  		/*
>  		 * Isolate the page to avoid collapsing an hugepage
>  		 * currently in use by the VM.
> @@ -2197,7 +2223,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		    mmu_notifier_test_young(vma->vm_mm, address))
>  			referenced = 1;
>  	}
> -	if (likely(referenced))
> +	if (likely(referenced && writable))
>  		return 1;
>  out:
>  	release_pte_pages(pte, _pte);
> @@ -2550,7 +2576,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  {
>  	pmd_t *pmd;
>  	pte_t *pte, *_pte;
> -	int ret = 0, referenced = 0, none = 0;
> +	int ret = 0, referenced = 0, none = 0, ro = 0, writable = 0;
>  	struct page *page;
>  	unsigned long _address;
>  	spinlock_t *ptl;
> @@ -2568,13 +2594,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  	     _pte++, _address += PAGE_SIZE) {
>  		pte_t pteval = *_pte;
>  		if (pte_none(pteval)) {
> +			ro++;
>  			if (++none <= khugepaged_max_ptes_none)
>  				continue;
>  			else
>  				goto out_unmap;
>  		}
> -		if (!pte_present(pteval) || !pte_write(pteval))
> +		if (!pte_present(pteval))
>  			goto out_unmap;
> +		if (!pte_write(pteval)) {
> +			if (++ro > khugepaged_max_ptes_none)
> +				goto out_unmap;
> +		} else {
> +			writable = 1;
> +		}
> +
>  		page = vm_normal_page(vma, _address, pteval);
>  		if (unlikely(!page))
>  			goto out_unmap;
> @@ -2591,14 +2625,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  		VM_BUG_ON_PAGE(PageCompound(page), page);
>  		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
>  			goto out_unmap;
> -		/* cannot use mapcount: can't collapse if there's a gup pin */
> -		if (page_count(page) != 1)
> +		/*
> +		 * cannot use mapcount: can't collapse if there's a gup pin.
> +		 * The page must only be referenced by the scanned process
> +		 * and page swap cache.
> +		 */
> +		if (page_count(page) != 1 + !!PageSwapCache(page))
>  			goto out_unmap;
>  		if (pte_young(pteval) || PageReferenced(page) ||
>  		    mmu_notifier_test_young(vma->vm_mm, address))
>  			referenced = 1;
>  	}
> -	if (referenced)
> +	if (referenced && writable)
>  		ret = 1;
>  out_unmap:
>  	pte_unmap_unlock(pte, ptl);
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply	[flat|nested] 5+ messages in thread
end of thread, other threads:[~2015-01-28 13:57 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-27 17:39 [PATCH v3] mm: incorporate read-only pages into transparent huge pages Ebru Akagunduz
2015-01-28  0:27 ` Andrea Arcangeli
2015-01-28  9:13   ` Vlastimil Babka
2015-01-28 13:51   ` Zhang Yanfei
2015-01-28 13:57 ` Zhang Yanfei
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).