[PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
@ 2016-01-06 22:37 Mike Kravetz
  2016-01-07  8:06 ` Hillf Danton
  2016-01-11 22:35 ` Andrew Morton
  0 siblings, 2 replies; 12+ messages in thread
From: Mike Kravetz @ 2016-01-06 22:37 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: Hugh Dickins, Naoya Horiguchi, Hillf Danton, Davidlohr Bueso,
	Dave Hansen, Andrew Morton, Mike Kravetz

Page faults can race with fallocate hole punch.  If a page fault happens
between the unmap and remove operations, the page is not removed and
remains within the hole.  This is not the desired behavior.  The race
is difficult to detect in user level code as even in the non-race
case, a page within the hole could be faulted back in before fallocate
returns.  If userfaultfd is expanded to support hugetlbfs in the future,
this race will be easier to observe.

If this race is detected and a page is mapped, the remove operation
(remove_inode_hugepages) will unmap the page before removing.  The unmap
within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
so that no other faults will be processed until the page is removed.

The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
remove_inode_hugepages to satisfy the new reference.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c | 139 +++++++++++++++++++++++++++------------------------
 1 file changed, 73 insertions(+), 66 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0444760..0871d70 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
 	delete_from_page_cache(page);
 }
 
+static inline void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * end == 0 indicates that the entire range after
+	 * start should be unmapped.
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+		unsigned long v_offset;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		if (end) {
+			end = ((end - start) << PAGE_SHIFT) +
+			       vma->vm_start + v_offset;
+			if (end > vma->vm_end)
+				end = vma->vm_end;
+		} else
+			end = vma->vm_end;
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
+	}
+}
+
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
-
+ *
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +414,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
+			bool rsv_on_error;
 			u32 hash;
 
 			/*
@@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 							mapping, next, 0);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-			lock_page(page);
-			if (likely(!page_mapped(page))) {
-				bool rsv_on_error = !PagePrivate(page);
-				/*
-				 * We must free the huge page and remove
-				 * from page cache (remove_huge_page) BEFORE
-				 * removing the region/reserve map
-				 * (hugetlb_unreserve_pages).  In rare out
-				 * of memory conditions, removal of the
-				 * region/reserve map could fail.  Before
-				 * free'ing the page, note PagePrivate which
-				 * is used in case of error.
-				 */
-				remove_huge_page(page);
-				freed++;
-				if (!truncate_op) {
-					if (unlikely(hugetlb_unreserve_pages(
-							inode, next,
-							next + 1, 1)))
-						hugetlb_fix_reserve_counts(
-							inode, rsv_on_error);
-				}
-			} else {
-				/*
-				 * If page is mapped, it was faulted in after
-				 * being unmapped.  It indicates a race between
-				 * hole punch and page fault.  Do nothing in
-				 * this case.  Getting here in a truncate
-				 * operation is a bug.
-				 */
+			/*
+			 * If page is mapped, it was faulted in after being
+			 * unmapped in caller.  Unmap (again) now after taking
+			 * the fault mutex.  The mutex will prevent faults
+			 * until we finish removing the page.
+			 *
+			 * This race can only happen in the hole punch case.
+			 * Getting here in a truncate operation is a bug.
+			 */
+			if (unlikely(page_mapped(page))) {
 				BUG_ON(truncate_op);
+
+				i_mmap_lock_write(mapping);
+				hugetlb_vmdelete_list(&mapping->i_mmap,
+					next * pages_per_huge_page(h),
+					(next + 1) * pages_per_huge_page(h));
+				i_mmap_unlock_write(mapping);
+			}
+
+			lock_page(page);
+			/*
+			 * We must free the huge page and remove from page
+			 * cache (remove_huge_page) BEFORE removing the
+			 * region/reserve map (hugetlb_unreserve_pages).  In
+			 * rare out of memory conditions, removal of the
+			 * region/reserve map could fail.  Before free'ing
+			 * the page, note PagePrivate which is used in case
+			 * of error.
+			 */
+			rsv_on_error = !PagePrivate(page);
+			remove_huge_page(page);
+			freed++;
+			if (!truncate_op) {
+				if (unlikely(hugetlb_unreserve_pages(inode,
+							next, next + 1, 1)))
+					hugetlb_fix_reserve_counts(inode,
+								rsv_on_error);
 			}
 
 			unlock_page(page);
@@ -452,41 +494,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
-	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-		unsigned long v_offset;
-
-		/*
-		 * Can the expression below overflow on 32-bit arches?
-		 * No, because the interval tree returns us only those vmas
-		 * which overlap the truncated area starting at pgoff,
-		 * and no vma on a 32-bit arch can span beyond the 4GB.
-		 */
-		if (vma->vm_pgoff < start)
-			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-		else
-			v_offset = 0;
-
-		if (end) {
-			end = ((end - start) << PAGE_SHIFT) +
-			       vma->vm_start + v_offset;
-			if (end > vma->vm_end)
-				end = vma->vm_end;
-		} else
-			end = vma->vm_end;
-
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
-	}
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;
-- 
2.4.3

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-06 22:37 [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch Mike Kravetz
@ 2016-01-07  8:06 ` Hillf Danton
  2016-01-07 16:46   ` Mike Kravetz
  2016-01-08  4:39   ` Mike Kravetz
  2016-01-11 22:35 ` Andrew Morton
  1 sibling, 2 replies; 12+ messages in thread
From: Hillf Danton @ 2016-01-07  8:06 UTC (permalink / raw)
  To: 'Mike Kravetz', linux-kernel, linux-mm
  Cc: 'Hugh Dickins', 'Naoya Horiguchi',
	'Davidlohr Bueso', 'Dave Hansen',
	'Andrew Morton', 'Michel Lespinasse'

> 
> Page faults can race with fallocate hole punch.  If a page fault happens
> between the unmap and remove operations, the page is not removed and
> remains within the hole.  This is not the desired behavior.  The race
> is difficult to detect in user level code as even in the non-race
> case, a page within the hole could be faulted back in before fallocate
> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
> this race will be easier to observe.
> 
> If this race is detected and a page is mapped, the remove operation
> (remove_inode_hugepages) will unmap the page before removing.  The unmap
> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
> so that no other faults will be processed until the page is removed.
> 
> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
> remove_inode_hugepages to satisfy the new reference.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/hugetlbfs/inode.c | 139 +++++++++++++++++++++++++++------------------------
>  1 file changed, 73 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 0444760..0871d70 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
>  	delete_from_page_cache(page);
>  }
> 
> +static inline void
> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
> +{
> +	struct vm_area_struct *vma;
> +
> +	/*
> +	 * end == 0 indicates that the entire range after
> +	 * start should be unmapped.
> +	 */
> +	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {

[1] perhaps end can be reused.

> +		unsigned long v_offset;
> +
> +		/*
> +		 * Can the expression below overflow on 32-bit arches?
> +		 * No, because the interval tree returns us only those vmas
> +		 * which overlap the truncated area starting at pgoff,
> +		 * and no vma on a 32-bit arch can span beyond the 4GB.
> +		 */
> +		if (vma->vm_pgoff < start)
> +			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
> +		else
> +			v_offset = 0;
> +
> +		if (end) {
> +			end = ((end - start) << PAGE_SHIFT) +
> +			       vma->vm_start + v_offset;

[2] end is input to be pgoff_t, but changed to be the type of v_offset.
Further we cannot handle the case that end is input to be zero.
See the diff below please.

> +			if (end > vma->vm_end)
> +				end = vma->vm_end;
> +		} else
> +			end = vma->vm_end;
> +
> +		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> +	}
> +}
> +
> 
>  /*
>   * remove_inode_hugepages handles two distinct cases: truncation and hole
>   * punch.  There are subtle differences in operation for each case.
> -
> + *
>   * truncation is indicated by end of range being LLONG_MAX
>   *	In this case, we first scan the range and release found pages.
>   *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
> @@ -379,6 +414,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> 
>  		for (i = 0; i < pagevec_count(&pvec); ++i) {
>  			struct page *page = pvec.pages[i];
> +			bool rsv_on_error;
>  			u32 hash;
> 
>  			/*
> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>  							mapping, next, 0);
>  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
> 
> -			lock_page(page);
> -			if (likely(!page_mapped(page))) {
> -				bool rsv_on_error = !PagePrivate(page);
> -				/*
> -				 * We must free the huge page and remove
> -				 * from page cache (remove_huge_page) BEFORE
> -				 * removing the region/reserve map
> -				 * (hugetlb_unreserve_pages).  In rare out
> -				 * of memory conditions, removal of the
> -				 * region/reserve map could fail.  Before
> -				 * free'ing the page, note PagePrivate which
> -				 * is used in case of error.
> -				 */
> -				remove_huge_page(page);
> -				freed++;
> -				if (!truncate_op) {
> -					if (unlikely(hugetlb_unreserve_pages(
> -							inode, next,
> -							next + 1, 1)))
> -						hugetlb_fix_reserve_counts(
> -							inode, rsv_on_error);
> -				}
> -			} else {
> -				/*
> -				 * If page is mapped, it was faulted in after
> -				 * being unmapped.  It indicates a race between
> -				 * hole punch and page fault.  Do nothing in
> -				 * this case.  Getting here in a truncate
> -				 * operation is a bug.
> -				 */
> +			/*
> +			 * If page is mapped, it was faulted in after being
> +			 * unmapped in caller.  Unmap (again) now after taking
> +			 * the fault mutex.  The mutex will prevent faults
> +			 * until we finish removing the page.
> +			 *
> +			 * This race can only happen in the hole punch case.
> +			 * Getting here in a truncate operation is a bug.
> +			 */
> +			if (unlikely(page_mapped(page))) {
>  				BUG_ON(truncate_op);
> +
> +				i_mmap_lock_write(mapping);
> +				hugetlb_vmdelete_list(&mapping->i_mmap,
> +					next * pages_per_huge_page(h),
> +					(next + 1) * pages_per_huge_page(h));
> +				i_mmap_unlock_write(mapping);
> +			}
> +
> +			lock_page(page);
> +			/*
> +			 * We must free the huge page and remove from page
> +			 * cache (remove_huge_page) BEFORE removing the
> +			 * region/reserve map (hugetlb_unreserve_pages).  In
> +			 * rare out of memory conditions, removal of the
> +			 * region/reserve map could fail.  Before free'ing
> +			 * the page, note PagePrivate which is used in case
> +			 * of error.
> +			 */
> +			rsv_on_error = !PagePrivate(page);
> +			remove_huge_page(page);
> +			freed++;
> +			if (!truncate_op) {
> +				if (unlikely(hugetlb_unreserve_pages(inode,
> +							next, next + 1, 1)))
> +					hugetlb_fix_reserve_counts(inode,
> +								rsv_on_error);
>  			}
> 
>  			unlock_page(page);
> @@ -452,41 +494,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
>  	clear_inode(inode);
>  }
> 
> -static inline void
> -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
> -{
> -	struct vm_area_struct *vma;
> -
> -	/*
> -	 * end == 0 indicates that the entire range after
> -	 * start should be unmapped.
> -	 */
> -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> -		unsigned long v_offset;
> -
> -		/*
> -		 * Can the expression below overflow on 32-bit arches?
> -		 * No, because the interval tree returns us only those vmas
> -		 * which overlap the truncated area starting at pgoff,
> -		 * and no vma on a 32-bit arch can span beyond the 4GB.
> -		 */
> -		if (vma->vm_pgoff < start)
> -			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
> -		else
> -			v_offset = 0;
> -
> -		if (end) {
> -			end = ((end - start) << PAGE_SHIFT) +
> -			       vma->vm_start + v_offset;
> -			if (end > vma->vm_end)
> -				end = vma->vm_end;
> -		} else
> -			end = vma->vm_end;
> -
> -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> -	}
> -}
> -
>  static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
>  {
>  	pgoff_t pgoff;
> --
> 2.4.3
> 

--- a/fs/hugetlbfs/inode.c	Thu Jan  7 15:04:35 2016
+++ b/fs/hugetlbfs/inode.c	Thu Jan  7 15:31:03 2016
@@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
 	 * end == 0 indicates that the entire range after
 	 * start should be unmapped.
 	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+	if (!end)
+		end = ULONG_MAX;
+	vma_interval_tree_foreach(vma, root, start, end) {
 		unsigned long v_offset;
+		unsigned long v_end;
 
 		/*
 		 * Can the expression below overflow on 32-bit arches?
@@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
 		else
 			v_offset = 0;
 
-		if (end) {
-			end = ((end - start) << PAGE_SHIFT) +
+		v_end = ((end - start) << PAGE_SHIFT) +
 			       vma->vm_start + v_offset;
-			if (end > vma->vm_end)
-				end = vma->vm_end;
-		} else
-			end = vma->vm_end;
+		if (v_end > vma->vm_end)
+			v_end = vma->vm_end;
 
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);
 	}
 }
 
--

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-07  8:06 ` Hillf Danton
@ 2016-01-07 16:46   ` Mike Kravetz
  2016-01-08  4:39   ` Mike Kravetz
  1 sibling, 0 replies; 12+ messages in thread
From: Mike Kravetz @ 2016-01-07 16:46 UTC (permalink / raw)
  To: Hillf Danton, linux-kernel, linux-mm
  Cc: 'Hugh Dickins', 'Naoya Horiguchi',
	'Davidlohr Bueso', 'Dave Hansen',
	'Andrew Morton', 'Michel Lespinasse'

On 01/07/2016 12:06 AM, Hillf Danton wrote:
>>
>> Page faults can race with fallocate hole punch.  If a page fault happens
>> between the unmap and remove operations, the page is not removed and
>> remains within the hole.  This is not the desired behavior.  The race
>> is difficult to detect in user level code as even in the non-race
>> case, a page within the hole could be faulted back in before fallocate
>> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
>> this race will be easier to observe.
>>
>> If this race is detected and a page is mapped, the remove operation
>> (remove_inode_hugepages) will unmap the page before removing.  The unmap
>> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
>> so that no other faults will be processed until the page is removed.
>>
>> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
>> remove_inode_hugepages to satisfy the new reference.
>>
>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>> ---
>>  fs/hugetlbfs/inode.c | 139 +++++++++++++++++++++++++++------------------------
>>  1 file changed, 73 insertions(+), 66 deletions(-)
>>
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 0444760..0871d70 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
>>  	delete_from_page_cache(page);
>>  }
>>
>> +static inline void
>> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> +{
>> +	struct vm_area_struct *vma;
>> +
>> +	/*
>> +	 * end == 0 indicates that the entire range after
>> +	 * start should be unmapped.
>> +	 */
>> +	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> 
> [1] perhaps end can be reused.
> 
>> +		unsigned long v_offset;
>> +
>> +		/*
>> +		 * Can the expression below overflow on 32-bit arches?
>> +		 * No, because the interval tree returns us only those vmas
>> +		 * which overlap the truncated area starting at pgoff,
>> +		 * and no vma on a 32-bit arch can span beyond the 4GB.
>> +		 */
>> +		if (vma->vm_pgoff < start)
>> +			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> +		else
>> +			v_offset = 0;
>> +
>> +		if (end) {
>> +			end = ((end - start) << PAGE_SHIFT) +
>> +			       vma->vm_start + v_offset;
> 
> [2] end is input to be pgoff_t, but changed to be the type of v_offset.
> Further we cannot handle the case that end is input to be zero.
> See the diff below please.

Thanks Hillf.

This bug is part of the existing code.  I did not modify current
hugetlb_vmdelete_list code, just moved it as part of this patch.
Therefore, I will create a separate patch to fix this issue.

-- 
Mike Kravetz

> 
>> +			if (end > vma->vm_end)
>> +				end = vma->vm_end;
>> +		} else
>> +			end = vma->vm_end;
>> +
>> +		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
>> +	}
>> +}
>> +
>>
>>  /*
>>   * remove_inode_hugepages handles two distinct cases: truncation and hole
>>   * punch.  There are subtle differences in operation for each case.
>> -
>> + *
>>   * truncation is indicated by end of range being LLONG_MAX
>>   *	In this case, we first scan the range and release found pages.
>>   *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
>> @@ -379,6 +414,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>
>>  		for (i = 0; i < pagevec_count(&pvec); ++i) {
>>  			struct page *page = pvec.pages[i];
>> +			bool rsv_on_error;
>>  			u32 hash;
>>
>>  			/*
>> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>  							mapping, next, 0);
>>  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
>>
>> -			lock_page(page);
>> -			if (likely(!page_mapped(page))) {
>> -				bool rsv_on_error = !PagePrivate(page);
>> -				/*
>> -				 * We must free the huge page and remove
>> -				 * from page cache (remove_huge_page) BEFORE
>> -				 * removing the region/reserve map
>> -				 * (hugetlb_unreserve_pages).  In rare out
>> -				 * of memory conditions, removal of the
>> -				 * region/reserve map could fail.  Before
>> -				 * free'ing the page, note PagePrivate which
>> -				 * is used in case of error.
>> -				 */
>> -				remove_huge_page(page);
>> -				freed++;
>> -				if (!truncate_op) {
>> -					if (unlikely(hugetlb_unreserve_pages(
>> -							inode, next,
>> -							next + 1, 1)))
>> -						hugetlb_fix_reserve_counts(
>> -							inode, rsv_on_error);
>> -				}
>> -			} else {
>> -				/*
>> -				 * If page is mapped, it was faulted in after
>> -				 * being unmapped.  It indicates a race between
>> -				 * hole punch and page fault.  Do nothing in
>> -				 * this case.  Getting here in a truncate
>> -				 * operation is a bug.
>> -				 */
>> +			/*
>> +			 * If page is mapped, it was faulted in after being
>> +			 * unmapped in caller.  Unmap (again) now after taking
>> +			 * the fault mutex.  The mutex will prevent faults
>> +			 * until we finish removing the page.
>> +			 *
>> +			 * This race can only happen in the hole punch case.
>> +			 * Getting here in a truncate operation is a bug.
>> +			 */
>> +			if (unlikely(page_mapped(page))) {
>>  				BUG_ON(truncate_op);
>> +
>> +				i_mmap_lock_write(mapping);
>> +				hugetlb_vmdelete_list(&mapping->i_mmap,
>> +					next * pages_per_huge_page(h),
>> +					(next + 1) * pages_per_huge_page(h));
>> +				i_mmap_unlock_write(mapping);
>> +			}
>> +
>> +			lock_page(page);
>> +			/*
>> +			 * We must free the huge page and remove from page
>> +			 * cache (remove_huge_page) BEFORE removing the
>> +			 * region/reserve map (hugetlb_unreserve_pages).  In
>> +			 * rare out of memory conditions, removal of the
>> +			 * region/reserve map could fail.  Before free'ing
>> +			 * the page, note PagePrivate which is used in case
>> +			 * of error.
>> +			 */
>> +			rsv_on_error = !PagePrivate(page);
>> +			remove_huge_page(page);
>> +			freed++;
>> +			if (!truncate_op) {
>> +				if (unlikely(hugetlb_unreserve_pages(inode,
>> +							next, next + 1, 1)))
>> +					hugetlb_fix_reserve_counts(inode,
>> +								rsv_on_error);
>>  			}
>>
>>  			unlock_page(page);
>> @@ -452,41 +494,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
>>  	clear_inode(inode);
>>  }
>>
>> -static inline void
>> -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> -{
>> -	struct vm_area_struct *vma;
>> -
>> -	/*
>> -	 * end == 0 indicates that the entire range after
>> -	 * start should be unmapped.
>> -	 */
>> -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
>> -		unsigned long v_offset;
>> -
>> -		/*
>> -		 * Can the expression below overflow on 32-bit arches?
>> -		 * No, because the interval tree returns us only those vmas
>> -		 * which overlap the truncated area starting at pgoff,
>> -		 * and no vma on a 32-bit arch can span beyond the 4GB.
>> -		 */
>> -		if (vma->vm_pgoff < start)
>> -			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> -		else
>> -			v_offset = 0;
>> -
>> -		if (end) {
>> -			end = ((end - start) << PAGE_SHIFT) +
>> -			       vma->vm_start + v_offset;
>> -			if (end > vma->vm_end)
>> -				end = vma->vm_end;
>> -		} else
>> -			end = vma->vm_end;
>> -
>> -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
>> -	}
>> -}
>> -
>>  static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
>>  {
>>  	pgoff_t pgoff;
>> --
>> 2.4.3
>>
> 
> --- a/fs/hugetlbfs/inode.c	Thu Jan  7 15:04:35 2016
> +++ b/fs/hugetlbfs/inode.c	Thu Jan  7 15:31:03 2016
> @@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
>  	 * end == 0 indicates that the entire range after
>  	 * start should be unmapped.
>  	 */
> -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> +	if (!end)
> +		end = ULONG_MAX;
> +	vma_interval_tree_foreach(vma, root, start, end) {
>  		unsigned long v_offset;
> +		unsigned long v_end;
>  
>  		/*
>  		 * Can the expression below overflow on 32-bit arches?
> @@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
>  		else
>  			v_offset = 0;
>  
> -		if (end) {
> -			end = ((end - start) << PAGE_SHIFT) +
> +		v_end = ((end - start) << PAGE_SHIFT) +
>  			       vma->vm_start + v_offset;
> -			if (end > vma->vm_end)
> -				end = vma->vm_end;
> -		} else
> -			end = vma->vm_end;
> +		if (v_end > vma->vm_end)
> +			v_end = vma->vm_end;
>  
> -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> +		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);
>  	}
>  }
>  
> --
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-07  8:06 ` Hillf Danton
  2016-01-07 16:46   ` Mike Kravetz
@ 2016-01-08  4:39   ` Mike Kravetz
  2016-01-08  6:25     ` Hillf Danton
  1 sibling, 1 reply; 12+ messages in thread
From: Mike Kravetz @ 2016-01-08  4:39 UTC (permalink / raw)
  To: Hillf Danton, linux-kernel, linux-mm
  Cc: 'Hugh Dickins', 'Naoya Horiguchi',
	'Davidlohr Bueso', 'Dave Hansen',
	'Andrew Morton', 'Michel Lespinasse'

On 01/07/2016 12:06 AM, Hillf Danton wrote:
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 0444760..0871d70 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
>>  	delete_from_page_cache(page);
>>  }
>>
>> +static inline void
>> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> +{
>> +	struct vm_area_struct *vma;
>> +
>> +	/*
>> +	 * end == 0 indicates that the entire range after
>> +	 * start should be unmapped.
>> +	 */
>> +	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> 
> [1] perhaps end can be reused.
> 
>> +		unsigned long v_offset;
>> +
>> +		/*
>> +		 * Can the expression below overflow on 32-bit arches?
>> +		 * No, because the interval tree returns us only those vmas
>> +		 * which overlap the truncated area starting at pgoff,
>> +		 * and no vma on a 32-bit arch can span beyond the 4GB.
>> +		 */
>> +		if (vma->vm_pgoff < start)
>> +			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> +		else
>> +			v_offset = 0;
>> +
>> +		if (end) {
>> +			end = ((end - start) << PAGE_SHIFT) +
>> +			       vma->vm_start + v_offset;
> 
> [2] end is input to be pgoff_t, but changed to be the type of v_offset.
> Further we cannot handle the case that end is input to be zero.
> See the diff below please.
>
<snip>
> 
> --- a/fs/hugetlbfs/inode.c	Thu Jan  7 15:04:35 2016
> +++ b/fs/hugetlbfs/inode.c	Thu Jan  7 15:31:03 2016
> @@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
>  	 * end == 0 indicates that the entire range after
>  	 * start should be unmapped.
>  	 */
> -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> +	if (!end)
> +		end = ULONG_MAX;
> +	vma_interval_tree_foreach(vma, root, start, end) {
>  		unsigned long v_offset;
> +		unsigned long v_end;
>  
>  		/*
>  		 * Can the expression below overflow on 32-bit arches?
> @@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
>  		else
>  			v_offset = 0;
>  
> -		if (end) {
> -			end = ((end - start) << PAGE_SHIFT) +
> +		v_end = ((end - start) << PAGE_SHIFT) +
>  			       vma->vm_start + v_offset;
> -			if (end > vma->vm_end)
> -				end = vma->vm_end;
> -		} else
> -			end = vma->vm_end;
> +		if (v_end > vma->vm_end)
> +			v_end = vma->vm_end;
>  
> -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> +		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);
>  	}
>  }
>  
> --

Unfortunately, that calculation of v_end is not correct.  I know it
is based on the existing code, but the existing code it not correct.

I attempted to fix in a patch earlier today, but that was not correct
either.  Below is a proposed new version of hugetlb_vmdelete_list.
Let me know what you think.

static inline void
hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
	struct vm_area_struct *vma;

	/*
	 * end == 0 indicates that the entire range after
	 * start should be unmapped.
	 */
	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
		unsigned long v_offset;
		unsigned long v_end;

		/*
		 * Can the expression below overflow on 32-bit arches?
		 * No, because the interval tree returns us only those vmas
		 * which overlap the truncated area starting at pgoff,
		 * and no vma on a 32-bit arch can span beyond the 4GB.
		 */
		if (vma->vm_pgoff < start)
			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
		else
			v_offset = 0;

		if (!end)
			v_end = vma->vm_end;
		else {
			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
							+ vma->vm_start;
			if (v_end > vma->vm_end)
				v_end = vma->vm_end;
		}

		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
									NULL);
	}
}

-- 
Mike Kravetz

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-08  4:39   ` Mike Kravetz
@ 2016-01-08  6:25     ` Hillf Danton
  0 siblings, 0 replies; 12+ messages in thread
From: Hillf Danton @ 2016-01-08  6:25 UTC (permalink / raw)
  To: 'Mike Kravetz', linux-kernel, linux-mm
  Cc: 'Hugh Dickins', 'Naoya Horiguchi',
	'Davidlohr Bueso', 'Dave Hansen',
	'Andrew Morton', 'Michel Lespinasse'

> On 01/07/2016 12:06 AM, Hillf Danton wrote:
> >> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> >> index 0444760..0871d70 100644
> >> --- a/fs/hugetlbfs/inode.c
> >> +++ b/fs/hugetlbfs/inode.c
> >> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
> >>  	delete_from_page_cache(page);
> >>  }
> >>
> >> +static inline void
> >> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
> >> +{
> >> +	struct vm_area_struct *vma;
> >> +
> >> +	/*
> >> +	 * end == 0 indicates that the entire range after
> >> +	 * start should be unmapped.
> >> +	 */
> >> +	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> >
> > [1] perhaps end can be reused.
> >
> >> +		unsigned long v_offset;
> >> +
> >> +		/*
> >> +		 * Can the expression below overflow on 32-bit arches?
> >> +		 * No, because the interval tree returns us only those vmas
> >> +		 * which overlap the truncated area starting at pgoff,
> >> +		 * and no vma on a 32-bit arch can span beyond the 4GB.
> >> +		 */
> >> +		if (vma->vm_pgoff < start)
> >> +			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
> >> +		else
> >> +			v_offset = 0;
> >> +
> >> +		if (end) {
> >> +			end = ((end - start) << PAGE_SHIFT) +
> >> +			       vma->vm_start + v_offset;
> >
> > [2] end is input to be pgoff_t, but changed to be the type of v_offset.
> > Further we cannot handle the case that end is input to be zero.
> > See the diff below please.
> >
> <snip>
> >
> > --- a/fs/hugetlbfs/inode.c	Thu Jan  7 15:04:35 2016
> > +++ b/fs/hugetlbfs/inode.c	Thu Jan  7 15:31:03 2016
> > @@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
> >  	 * end == 0 indicates that the entire range after
> >  	 * start should be unmapped.
> >  	 */
> > -	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> > +	if (!end)
> > +		end = ULONG_MAX;
> > +	vma_interval_tree_foreach(vma, root, start, end) {
> >  		unsigned long v_offset;
> > +		unsigned long v_end;
> >
> >  		/*
> >  		 * Can the expression below overflow on 32-bit arches?
> > @@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
> >  		else
> >  			v_offset = 0;
> >
> > -		if (end) {
> > -			end = ((end - start) << PAGE_SHIFT) +
> > +		v_end = ((end - start) << PAGE_SHIFT) +
> >  			       vma->vm_start + v_offset;
> > -			if (end > vma->vm_end)
> > -				end = vma->vm_end;
> > -		} else
> > -			end = vma->vm_end;
> > +		if (v_end > vma->vm_end)
> > +			v_end = vma->vm_end;
> >
> > -		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> > +		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);
> >  	}
> >  }
> >
> > --
> 
> Unfortunately, that calculation of v_end is not correct.  I know it
> is based on the existing code, but the existing code it not correct.
> 
> I attempted to fix in a patch earlier today, but that was not correct
> either.  Below is a proposed new version of hugetlb_vmdelete_list.

Thanks Mike.

> Let me know what you think.
> 
> static inline void
> hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
> {
> 	struct vm_area_struct *vma;
> 
> 	/*
> 	 * end == 0 indicates that the entire range after
> 	 * start should be unmapped.
> 	 */
> 	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> 		unsigned long v_offset;
> 		unsigned long v_end;
> 
> 		/*
> 		 * Can the expression below overflow on 32-bit arches?
> 		 * No, because the interval tree returns us only those vmas
> 		 * which overlap the truncated area starting at pgoff,
> 		 * and no vma on a 32-bit arch can span beyond the 4GB.
> 		 */
> 		if (vma->vm_pgoff < start)
> 			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
> 		else
> 			v_offset = 0;
> 
> 		if (!end)
> 			v_end = vma->vm_end;
> 		else {
> 			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
> 							+ vma->vm_start;
> 			if (v_end > vma->vm_end)
> 				v_end = vma->vm_end;
> 		}
> 
> 		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
> 									NULL);
> 	}
> }
> 
Looks good to me.

Hillf

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-06 22:37 [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch Mike Kravetz
  2016-01-07  8:06 ` Hillf Danton
@ 2016-01-11 22:35 ` Andrew Morton
  2016-01-11 23:38   ` Mike Kravetz
  1 sibling, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2016-01-11 22:35 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On Wed,  6 Jan 2016 14:37:04 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> Page faults can race with fallocate hole punch.  If a page fault happens
> between the unmap and remove operations, the page is not removed and
> remains within the hole.  This is not the desired behavior.  The race
> is difficult to detect in user level code as even in the non-race
> case, a page within the hole could be faulted back in before fallocate
> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
> this race will be easier to observe.
> 
> If this race is detected and a page is mapped, the remove operation
> (remove_inode_hugepages) will unmap the page before removing.  The unmap
> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
> so that no other faults will be processed until the page is removed.
> 
> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
> remove_inode_hugepages to satisfy the new reference.
> 
> ...
>
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
>
> ...
>
> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>  							mapping, next, 0);
>  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
>  
> -			lock_page(page);
> -			if (likely(!page_mapped(page))) {

hm, what are the locking requirements for page_mapped()?

> -				bool rsv_on_error = !PagePrivate(page);
> -				/*
> -				 * We must free the huge page and remove
> -				 * from page cache (remove_huge_page) BEFORE
> -				 * removing the region/reserve map
> -				 * (hugetlb_unreserve_pages).  In rare out
> -				 * of memory conditions, removal of the
> -				 * region/reserve map could fail.  Before
> -				 * free'ing the page, note PagePrivate which
> -				 * is used in case of error.
> -				 */
> -				remove_huge_page(page);

And remove_huge_page().

> -				freed++;
> -				if (!truncate_op) {
> -					if (unlikely(hugetlb_unreserve_pages(
> -							inode, next,
> -							next + 1, 1)))
> -						hugetlb_fix_reserve_counts(
> -							inode, rsv_on_error);
> -				}
>
> ...
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-11 22:35 ` Andrew Morton
@ 2016-01-11 23:38   ` Mike Kravetz
  2016-01-12  0:29     ` Andrew Morton
  0 siblings, 1 reply; 12+ messages in thread
From: Mike Kravetz @ 2016-01-11 23:38 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On 01/11/2016 02:35 PM, Andrew Morton wrote:
> On Wed,  6 Jan 2016 14:37:04 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
>> Page faults can race with fallocate hole punch.  If a page fault happens
>> between the unmap and remove operations, the page is not removed and
>> remains within the hole.  This is not the desired behavior.  The race
>> is difficult to detect in user level code as even in the non-race
>> case, a page within the hole could be faulted back in before fallocate
>> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
>> this race will be easier to observe.
>>
>> If this race is detected and a page is mapped, the remove operation
>> (remove_inode_hugepages) will unmap the page before removing.  The unmap
>> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
>> so that no other faults will be processed until the page is removed.
>>
>> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
>> remove_inode_hugepages to satisfy the new reference.
>>
>> ...
>>
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>>
>> ...
>>
>> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>  							mapping, next, 0);
>>  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
>>  
>> -			lock_page(page);
>> -			if (likely(!page_mapped(page))) {
> 
> hm, what are the locking requirements for page_mapped()?

page_mapped is just reading/evaluating an atomic within the struct page
which we have a referene on from the pagevec_lookup.  But, I think the
question is what prevents page_mapped from changing after we check it?

The patch takes the hugetlb_fault_mutex_table lock before checking
page_mapped.  If the page is unmapped and the hugetlb_fault_mutex_table
is held, it can not be faulted in and change from unmapped to mapped.

The new comment in the patch about taking hugetlb_fault_mutex_table is
right before the check for page_mapped.

> 
>> -				bool rsv_on_error = !PagePrivate(page);
>> -				/*
>> -				 * We must free the huge page and remove
>> -				 * from page cache (remove_huge_page) BEFORE
>> -				 * removing the region/reserve map
>> -				 * (hugetlb_unreserve_pages).  In rare out
>> -				 * of memory conditions, removal of the
>> -				 * region/reserve map could fail.  Before
>> -				 * free'ing the page, note PagePrivate which
>> -				 * is used in case of error.
>> -				 */
>> -				remove_huge_page(page);
> 
> And remove_huge_page().

The page must be locked before calling remove_huge_page, since it will
call delete_from_page_cache.  It currently is locked.  Would you prefer
a comment stating this before the call?

-- 
Mike Kravetz

> 
>> -				freed++;
>> -				if (!truncate_op) {
>> -					if (unlikely(hugetlb_unreserve_pages(
>> -							inode, next,
>> -							next + 1, 1)))
>> -						hugetlb_fix_reserve_counts(
>> -							inode, rsv_on_error);
>> -				}
>>
>> ...
>>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-11 23:38   ` Mike Kravetz
@ 2016-01-12  0:29     ` Andrew Morton
  2016-01-12  1:36       ` Mike Kravetz
  0 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2016-01-12  0:29 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On Mon, 11 Jan 2016 15:38:40 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> On 01/11/2016 02:35 PM, Andrew Morton wrote:
> > On Wed,  6 Jan 2016 14:37:04 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > 
> >> Page faults can race with fallocate hole punch.  If a page fault happens
> >> between the unmap and remove operations, the page is not removed and
> >> remains within the hole.  This is not the desired behavior.  The race
> >> is difficult to detect in user level code as even in the non-race
> >> case, a page within the hole could be faulted back in before fallocate
> >> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
> >> this race will be easier to observe.
> >>
> >> If this race is detected and a page is mapped, the remove operation
> >> (remove_inode_hugepages) will unmap the page before removing.  The unmap
> >> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
> >> so that no other faults will be processed until the page is removed.
> >>
> >> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
> >> remove_inode_hugepages to satisfy the new reference.
> >>
> >> ...
> >>
> >> --- a/fs/hugetlbfs/inode.c
> >> +++ b/fs/hugetlbfs/inode.c
> >>
> >> ...
> >>
> >> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> >>  							mapping, next, 0);
> >>  			mutex_lock(&hugetlb_fault_mutex_table[hash]);
> >>  
> >> -			lock_page(page);
> >> -			if (likely(!page_mapped(page))) {
> > 
> > hm, what are the locking requirements for page_mapped()?
> 
> page_mapped is just reading/evaluating an atomic within the struct page
> which we have a referene on from the pagevec_lookup.  But, I think the
> question is what prevents page_mapped from changing after we check it?
> 
> The patch takes the hugetlb_fault_mutex_table lock before checking
> page_mapped.  If the page is unmapped and the hugetlb_fault_mutex_table
> is held, it can not be faulted in and change from unmapped to mapped.
> 
> The new comment in the patch about taking hugetlb_fault_mutex_table is
> right before the check for page_mapped.

OK, thanks.

> > 
> >> -				bool rsv_on_error = !PagePrivate(page);
> >> -				/*
> >> -				 * We must free the huge page and remove
> >> -				 * from page cache (remove_huge_page) BEFORE
> >> -				 * removing the region/reserve map
> >> -				 * (hugetlb_unreserve_pages).  In rare out
> >> -				 * of memory conditions, removal of the
> >> -				 * region/reserve map could fail.  Before
> >> -				 * free'ing the page, note PagePrivate which
> >> -				 * is used in case of error.
> >> -				 */
> >> -				remove_huge_page(page);
> > 
> > And remove_huge_page().
> 
> The page must be locked before calling remove_huge_page, since it will
> call delete_from_page_cache.  It currently is locked.  Would you prefer
> a comment stating this before the call?

No, that doesn't seem nevessary.

I'll mark this patch as "pending, awaiting Mike's go-ahead".

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-12  0:29     ` Andrew Morton
@ 2016-01-12  1:36       ` Mike Kravetz
  2016-01-12  2:20         ` Andrew Morton
  0 siblings, 1 reply; 12+ messages in thread
From: Mike Kravetz @ 2016-01-12  1:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On 01/11/2016 04:29 PM, Andrew Morton wrote:
> On Mon, 11 Jan 2016 15:38:40 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
>> On 01/11/2016 02:35 PM, Andrew Morton wrote:
>>> On Wed,  6 Jan 2016 14:37:04 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:

<snip>

>>>> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
>>>> remove_inode_hugepages to satisfy the new reference.
>>>>

<snip>

> 
> I'll mark this patch as "pending, awaiting Mike's go-ahead".
> 

When this patch was originally submitted, bugs were discovered in the
hugetlb_vmdelete_list routine.  So, the patch "Fix bugs in
hugetlb_vmtruncate_list" was created.

I have retested the changes in this patch specifically dealing with
page fault/hole punch race on top of the new hugetlb_vmtruncate_list
routine.  Everything looks good.

How would you like to proceed with the patch?
- Should I create a series with the hugetlb_vmtruncate_list split out?
- Should I respin with hugetlb_vmtruncate_list patch applied?

Just let me know what is easiest/best for you.
-- 
Mike Kravetz

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-12  1:36       ` Mike Kravetz
@ 2016-01-12  2:20         ` Andrew Morton
  2016-01-12  3:21           ` Mike Kravetz
  0 siblings, 1 reply; 12+ messages in thread
From: Andrew Morton @ 2016-01-12  2:20 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On Mon, 11 Jan 2016 17:36:43 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> > 
> > I'll mark this patch as "pending, awaiting Mike's go-ahead".
> > 
> 
> When this patch was originally submitted, bugs were discovered in the
> hugetlb_vmdelete_list routine.  So, the patch "Fix bugs in
> hugetlb_vmtruncate_list" was created.
> 
> I have retested the changes in this patch specifically dealing with
> page fault/hole punch race on top of the new hugetlb_vmtruncate_list
> routine.  Everything looks good.
> 
> How would you like to proceed with the patch?
> - Should I create a series with the hugetlb_vmtruncate_list split out?
> - Should I respin with hugetlb_vmtruncate_list patch applied?
> 
> Just let me know what is easiest/best for you.

If you're saying that
http://ozlabs.org/~akpm/mmots/broken-out/mm-mempolicy-skip-non-migratable-vmas-when-setting-mpol_mf_lazy.patch
and
http://ozlabs.org/~akpm/mmots/broken-out/mm-hugetlbfs-unmap-pages-if-page-fault-raced-with-hole-punch.patch
are the final everything-works versions then we're all good to go now.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-12  2:20         ` Andrew Morton
@ 2016-01-12  3:21           ` Mike Kravetz
  2016-01-12  4:35             ` Andrew Morton
  0 siblings, 1 reply; 12+ messages in thread
From: Mike Kravetz @ 2016-01-12  3:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On 01/11/2016 06:20 PM, Andrew Morton wrote:
> On Mon, 11 Jan 2016 17:36:43 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
>>>
>>> I'll mark this patch as "pending, awaiting Mike's go-ahead".
>>>
>>
>> When this patch was originally submitted, bugs were discovered in the
>> hugetlb_vmdelete_list routine.  So, the patch "Fix bugs in
>> hugetlb_vmtruncate_list" was created.
>>
>> I have retested the changes in this patch specifically dealing with
>> page fault/hole punch race on top of the new hugetlb_vmtruncate_list
>> routine.  Everything looks good.
>>
>> How would you like to proceed with the patch?
>> - Should I create a series with the hugetlb_vmtruncate_list split out?
>> - Should I respin with hugetlb_vmtruncate_list patch applied?
>>
>> Just let me know what is easiest/best for you.
> 
> If you're saying that
> http://ozlabs.org/~akpm/mmots/broken-out/mm-mempolicy-skip-non-migratable-vmas-when-setting-mpol_mf_lazy.patch

That should be,
http://ozlabs.org/~akpm/mmots/broken-out/mm-hugetlbfs-fix-bugs-in-hugetlb_vmtruncate_list.patch

> and
> http://ozlabs.org/~akpm/mmots/broken-out/mm-hugetlbfs-unmap-pages-if-page-fault-raced-with-hole-punch.patch
> are the final everything-works versions then we're all good to go now.
> 

The only thing that 'might' be an issue is the new reference to
hugetlb_vmdelete_list() from remove_inode_hugepages().
hugetlb_vmdelete_list() was after remove_inode_hugepages() in the source
file.

The original patch moved hugetlb_vmdelete_list() to satisfy the new
reference.  I can not tell if that was taken into account in the way the
patches were pulled into your tree.  Will certainly know when it comes
time to build.

-- 
Mike Kravetz

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch
  2016-01-12  3:21           ` Mike Kravetz
@ 2016-01-12  4:35             ` Andrew Morton
  0 siblings, 0 replies; 12+ messages in thread
From: Andrew Morton @ 2016-01-12  4:35 UTC (permalink / raw)
  To: Mike Kravetz
  Cc: linux-kernel, linux-mm, Hugh Dickins, Naoya Horiguchi,
	Hillf Danton, Davidlohr Bueso, Dave Hansen

On Mon, 11 Jan 2016 19:21:15 -0800 Mike Kravetz <mike.kravetz@oracle.com> wrote:

> >> Just let me know what is easiest/best for you.
> > 
> > If you're saying that
> > http://ozlabs.org/~akpm/mmots/broken-out/mm-mempolicy-skip-non-migratable-vmas-when-setting-mpol_mf_lazy.patch
> 
> That should be,
> http://ozlabs.org/~akpm/mmots/broken-out/mm-hugetlbfs-fix-bugs-in-hugetlb_vmtruncate_list.patch

yup.

> > and
> > http://ozlabs.org/~akpm/mmots/broken-out/mm-hugetlbfs-unmap-pages-if-page-fault-raced-with-hole-punch.patch
> > are the final everything-works versions then we're all good to go now.
> > 
> 
> The only thing that 'might' be an issue is the new reference to
> hugetlb_vmdelete_list() from remove_inode_hugepages().
> hugetlb_vmdelete_list() was after remove_inode_hugepages() in the source
> file.
> 
> The original patch moved hugetlb_vmdelete_list() to satisfy the new
> reference.  I can not tell if that was taken into account in the way the
> patches were pulled into your tree.  Will certainly know when it comes
> time to build.

um, yes.

--- a/fs/hugetlbfs/inode.c~mm-hugetlbfs-unmap-pages-if-page-fault-raced-with-hole-punch-fix
+++ a/fs/hugetlbfs/inode.c
@@ -324,6 +324,44 @@ static void remove_huge_page(struct page
 	delete_from_page_cache(page);
 }
 
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * end == 0 indicates that the entire range after
+	 * start should be unmapped.
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+		unsigned long v_offset;
+		unsigned long v_end;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		if (!end)
+			v_end = vma->vm_end;
+		else {
+			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+							+ vma->vm_start;
+			if (v_end > vma->vm_end)
+				v_end = vma->vm_end;
+		}
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+									NULL);
+	}
+}
+
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
@@ -458,44 +496,6 @@ static void hugetlbfs_evict_inode(struct
 	clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
-	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-		unsigned long v_offset;
-		unsigned long v_end;
-
-		/*
-		 * Can the expression below overflow on 32-bit arches?
-		 * No, because the interval tree returns us only those vmas
-		 * which overlap the truncated area starting at pgoff,
-		 * and no vma on a 32-bit arch can span beyond the 4GB.
-		 */
-		if (vma->vm_pgoff < start)
-			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-		else
-			v_offset = 0;
-
-		if (!end)
-			v_end = vma->vm_end;
-		else {
-			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
-							+ vma->vm_start;
-			if (v_end > vma->vm_end)
-				v_end = vma->vm_end;
-		}
-
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-									NULL);
-	}
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;
_

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2016-01-12  4:36 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-01-06 22:37 [PATCH] mm/hugetlbfs: Unmap pages if page fault raced with hole punch Mike Kravetz
2016-01-07  8:06 ` Hillf Danton
2016-01-07 16:46   ` Mike Kravetz
2016-01-08  4:39   ` Mike Kravetz
2016-01-08  6:25     ` Hillf Danton
2016-01-11 22:35 ` Andrew Morton
2016-01-11 23:38   ` Mike Kravetz
2016-01-12  0:29     ` Andrew Morton
2016-01-12  1:36       ` Mike Kravetz
2016-01-12  2:20         ` Andrew Morton
2016-01-12  3:21           ` Mike Kravetz
2016-01-12  4:35             ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).