Re: [RFC PATCH 3/4] hugetlbfs: add hugetlbfs

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* Re: [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate()
       [not found] <00fc01d078e3$63428ec0$29c7ac40$@alibaba-inc.com>
@ 2015-04-17  8:00 ` Hillf Danton
  2015-04-17 17:14   ` Mike Kravetz
  0 siblings, 1 reply; 3+ messages in thread
From: Hillf Danton @ 2015-04-17  8:00 UTC (permalink / raw)
  To: Dave Hansen, Mike Kravetz; +Cc: linux-kernel, linux-mm

> 
> This is based on the shmem version, but it has diverged quite
> a bit.  We have no swap to worry about, nor the new file sealing.
> 
> What this allows us to do is move physical memory in and out of
> a hugetlbfs file without having it mapped.  This also gives us
> the ability to support MADV_REMOVE since it is currently
> implemented using fallocate().  MADV_REMOVE lets us remove data
> from the middle of a hugetlbfs file, which wasn't possible before.
> 
> hugetlbfs fallocate only operates on whole huge pages.
> 
> Based-on code-by: Dave Hansen <dave.hansen@linux.intel.com>
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/hugetlbfs/inode.c    | 139 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/hugetlb.h |   3 ++
>  mm/hugetlb.c            |   2 +-
>  3 files changed, 143 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index d5b67fd..6d48c8f 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -12,6 +12,7 @@
>  #include <linux/thread_info.h>
>  #include <asm/current.h>
>  #include <linux/sched.h>		/* remove ASAP */
> +#include <linux/falloc.h>
>  #include <linux/fs.h>
>  #include <linux/mount.h>
>  #include <linux/file.h>
> @@ -377,6 +378,143 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
>  	hugetlb_unreserve_pages(inode, start, freed);
>  }
> 
> +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +{
> +	struct hstate *h = hstate_inode(inode);
> +	unsigned long hpage_size = huge_page_size(h);
> +	loff_t hole_start, hole_end;
> +
> +	/*
> +	 * For hole punch round up the beginning offset of the hole and
> +	 * round down the end.
> +	 */
> +	hole_start = (offset + hpage_size - 1) & ~huge_page_mask(h);
> +	hole_end = (offset + len - (hpage_size - 1)) * ~huge_page_mask(h);
> +
> +	if ((u64)hole_end > (u64)hole_start) {
> +		struct address_space *mapping = &inode->i_data;
> +
> +		mutex_lock(&inode->i_mutex);
> +		unmap_mapping_range(mapping, hole_start, hole_end, 0);
> +		truncate_hugepages(inode, hole_start, hole_end);
> +		mutex_unlock(&inode->i_mutex);
> +	}
> +
> +	return 0;
> +}
> +
> +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
> +				loff_t len)
> +{
> +	struct inode *inode = file_inode(file);
> +	struct address_space *mapping = inode->i_mapping;
> +	struct hstate *h = hstate_inode(inode);
> +	struct vm_area_struct pseudo_vma;
> +	unsigned long hpage_size = huge_page_size(h);
> +	unsigned long hpage_shift = huge_page_shift(h);
> +	pgoff_t start, index, end;
> +	unsigned long addr;
> +	int error;
> +
> +	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> +		return -EOPNOTSUPP;
> +
> +	if (mode & FALLOC_FL_PUNCH_HOLE)
> +		return hugetlbfs_punch_hole(inode, offset, len);
> +
> +	/*
> +	 * Default preallocate case.
> +	 * For this range, start is rounded down and end is rounded up.
> +	 */
> +	start = offset >> hpage_shift;
> +	end = (offset + len + hpage_size - 1) >> hpage_shift;
> +
> +	mutex_lock(&inode->i_mutex);
> +
> +	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
> +	error = inode_newsize_ok(inode, offset + len);
> +	if (error)
> +		goto out;
> +
> +	/*
> +	 * Initialize a pseudo vma that just contains the policy used
> +	 * when allocating the huge pages.  The actual policy field
> +	 * (vm_policy) is determined based on the index in the loop below.
> +	 */
> +	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
> +	pseudo_vma.vm_start = 0;
> +	pseudo_vma.vm_flags |= (VM_HUGETLB | VM_MAYSHARE);
> +	pseudo_vma.vm_file = file;
> +
> +	/* addr is the offset within the file (zero based) */
> +	addr = start * hpage_size;
> +	for (index = start; index < end; index++) {
> +		/*
> +		 * This is supposed to be the vaddr where the page is being
> +		 * faulted in, but we have no vaddr here.
> +		 */
> +		struct page *page;
> +		int avoid_reserve = 1;
> +
> +		cond_resched();
> +
> +		/*
> +		 * fallocate(2) manpage permits EINTR; we may have been
> +		 * interrupted because we are using up too much memory.
> +		 */
> +		if (signal_pending(current)) {
> +			error = -EINTR;
> +			break;
> +		}
> +		page = find_get_page(mapping, index);
> +		if (page) {
> +			put_page(page);
> +			continue;
> +		}
> +
> +		/* Get policy based on index */
> +		pseudo_vma.vm_policy =
> +			mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
> +							index);
> +
> +		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
> +		mpol_cond_put(pseudo_vma.vm_policy);
> +		if (IS_ERR(page)) {
> +			error = PTR_ERR(page);
> +			goto out;
> +		}
> +		clear_huge_page(page, addr, pages_per_huge_page(h));
> +		__SetPageUptodate(page);
> +		error = huge_add_to_page_cache(page, mapping, index);
> +		if (error) {
> +			put_page(page);
> +			/* Keep going if we see an -EEXIST */
> +			if (error != -EEXIST)
> +				goto out;  /* FIXME, need to free? */
> +		}
> +
> +		/*
> +		 * page_put due to reference from alloc_huge_page()
> +		 * unlock_page because locked by add_to_page_cache()
> +		 */
> +		put_page(page);

Still needed if EEXIST?

> +		unlock_page(page);
> +
> +		/* Increment addr for next huge page */
> +		addr += hpage_size;
> +	}
> +
> +	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
> +		i_size_write(inode, offset + len);
> +	inode->i_ctime = CURRENT_TIME;
> +	spin_lock(&inode->i_lock);
> +	inode->i_private = NULL;
> +	spin_unlock(&inode->i_lock);
> +out:
> +	mutex_unlock(&inode->i_mutex);
> +	return error;
> +}
> +
>  static void hugetlbfs_evict_inode(struct inode *inode)
>  {
>  	struct resv_map *resv_map;
> @@ -743,6 +881,7 @@ const struct file_operations hugetlbfs_file_operations = {
>  	.fsync			= noop_fsync,
>  	.get_unmapped_area	= hugetlb_get_unmapped_area,
>  	.llseek		= default_llseek,
> +	.fallocate		= hugetlbfs_fallocate,
>  };
> 
>  static const struct inode_operations hugetlbfs_dir_inode_operations = {
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 6425945..d96b88e 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -322,6 +322,8 @@ struct huge_bootmem_page {
>  #endif
>  };
> 
> +struct page *alloc_huge_page(struct vm_area_struct *vma,
> +				unsigned long addr, int avoid_reserve);
>  struct page *alloc_huge_page_node(struct hstate *h, int nid);
>  struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
>  				unsigned long addr, int avoid_reserve);
> @@ -476,6 +478,7 @@ static inline bool hugepages_supported(void)
> 
>  #else	/* CONFIG_HUGETLB_PAGE */
>  struct hstate {};
> +#define alloc_huge_page(v, a, r) NULL
>  #define alloc_huge_page_node(h, nid) NULL
>  #define alloc_huge_page_noerr(v, a, r) NULL
>  #define alloc_bootmem_huge_page(h) NULL
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 7cda328..e130c6d 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1363,7 +1363,7 @@ static void vma_commit_reservation(struct hstate *h,
>  	region_add(resv, idx, idx + 1);
>  }
> 
> -static struct page *alloc_huge_page(struct vm_area_struct *vma,
> +struct page *alloc_huge_page(struct vm_area_struct *vma,
>  				    unsigned long addr, int avoid_reserve)
>  {
>  	struct hugepage_subpool *spool = subpool_vma(vma);
> --
> 2.1.0
> 
> 


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate()
  2015-04-17  8:00 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Hillf Danton
@ 2015-04-17 17:14   ` Mike Kravetz
  0 siblings, 0 replies; 3+ messages in thread
From: Mike Kravetz @ 2015-04-17 17:14 UTC (permalink / raw)
  To: Hillf Danton, Dave Hansen; +Cc: linux-kernel, linux-mm

On 04/17/2015 01:00 AM, Hillf Danton wrote:
>> +		clear_huge_page(page, addr, pages_per_huge_page(h));
>> +		__SetPageUptodate(page);
>> +		error = huge_add_to_page_cache(page, mapping, index);
>> +		if (error) {
>> +			put_page(page);
>> +			/* Keep going if we see an -EEXIST */
>> +			if (error != -EEXIST)
>> +				goto out;  /* FIXME, need to free? */
>> +		}
>> +
>> +		/*
>> +		 * page_put due to reference from alloc_huge_page()
>> +		 * unlock_page because locked by add_to_page_cache()
>> +		 */
>> +		put_page(page);
>
> Still needed if EEXIST?

Nope.  Good catch.

I'll fix this in the next version.
-- 
Mike Kravetz

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [RFC PATCH 0/4] hugetlbfs: add fallocate support
@ 2015-04-16 23:02 Mike Kravetz
  2015-04-16 23:02 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Mike Kravetz
  0 siblings, 1 reply; 3+ messages in thread
From: Mike Kravetz @ 2015-04-16 23:02 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Dave Hansen, Naoya Horiguchi, David Rientjes, Hugh Dickins,
	Davidlohr Bueso, Aneesh Kumar, Mike Kravetz

hugetlbfs is used today by applications that want a high degree of
control over huge page usage.  Often, large hugetlbfs files are used
to map a large number huge pages into the application processes.
The applications know when page ranges within these large files will
no longer be used, and ideally would like to release them back to
the subpool or global pools for other uses.  The fallocate() system
call provides an interface for preallocation and hole punching within
files.  This patch set adds fallocate functionality to hugetlbfs.

Mike Kravetz (4):
  hugetlbfs: truncate_hugepages() takes a range of pages
  hugetlbfs: New huge_add_to_page_cache helper routine
  hugetlbfs: add hugetlbfs_fallocate()
  mm: madvise allow remove operation for hugetlbfs

 fs/hugetlbfs/inode.c    | 164 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/hugetlb.h |   5 ++
 mm/hugetlb.c            |  29 ++++++---
 mm/madvise.c            |   2 +-
 4 files changed, 185 insertions(+), 15 deletions(-)

-- 
2.1.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate()
  2015-04-16 23:02 [RFC PATCH 0/4] hugetlbfs: add fallocate support Mike Kravetz
@ 2015-04-16 23:02 ` Mike Kravetz
  0 siblings, 0 replies; 3+ messages in thread
From: Mike Kravetz @ 2015-04-16 23:02 UTC (permalink / raw)
  To: linux-mm, linux-kernel
  Cc: Dave Hansen, Naoya Horiguchi, David Rientjes, Hugh Dickins,
	Davidlohr Bueso, Aneesh Kumar, Mike Kravetz

This is based on the shmem version, but it has diverged quite
a bit.  We have no swap to worry about, nor the new file sealing.

What this allows us to do is move physical memory in and out of
a hugetlbfs file without having it mapped.  This also gives us
the ability to support MADV_REMOVE since it is currently
implemented using fallocate().  MADV_REMOVE lets us remove data
from the middle of a hugetlbfs file, which wasn't possible before.

hugetlbfs fallocate only operates on whole huge pages.

Based-on code-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c    | 139 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h |   3 ++
 mm/hugetlb.c            |   2 +-
 3 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d5b67fd..6d48c8f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
 #include <linux/thread_info.h>
 #include <asm/current.h>
 #include <linux/sched.h>		/* remove ASAP */
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/file.h>
@@ -377,6 +378,143 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart, loff_t lend)
 	hugetlb_unreserve_pages(inode, start, freed);
 }
 
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct hstate *h = hstate_inode(inode);
+	unsigned long hpage_size = huge_page_size(h);
+	loff_t hole_start, hole_end;
+
+	/*
+	 * For hole punch round up the beginning offset of the hole and
+	 * round down the end.
+	 */
+	hole_start = (offset + hpage_size - 1) & ~huge_page_mask(h);
+	hole_end = (offset + len - (hpage_size - 1)) * ~huge_page_mask(h);
+
+	if ((u64)hole_end > (u64)hole_start) {
+		struct address_space *mapping = &inode->i_data;
+
+		mutex_lock(&inode->i_mutex);
+		unmap_mapping_range(mapping, hole_start, hole_end, 0);
+		truncate_hugepages(inode, hole_start, hole_end);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+				loff_t len)
+{
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+	struct hstate *h = hstate_inode(inode);
+	struct vm_area_struct pseudo_vma;
+	unsigned long hpage_size = huge_page_size(h);
+	unsigned long hpage_shift = huge_page_shift(h);
+	pgoff_t start, index, end;
+	unsigned long addr;
+	int error;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return hugetlbfs_punch_hole(inode, offset, len);
+
+	/*
+	 * Default preallocate case.
+	 * For this range, start is rounded down and end is rounded up.
+	 */
+	start = offset >> hpage_shift;
+	end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+	mutex_lock(&inode->i_mutex);
+
+	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+	error = inode_newsize_ok(inode, offset + len);
+	if (error)
+		goto out;
+
+	/*
+	 * Initialize a pseudo vma that just contains the policy used
+	 * when allocating the huge pages.  The actual policy field
+	 * (vm_policy) is determined based on the index in the loop below.
+	 */
+	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+	pseudo_vma.vm_start = 0;
+	pseudo_vma.vm_flags |= (VM_HUGETLB | VM_MAYSHARE);
+	pseudo_vma.vm_file = file;
+
+	/* addr is the offset within the file (zero based) */
+	addr = start * hpage_size;
+	for (index = start; index < end; index++) {
+		/*
+		 * This is supposed to be the vaddr where the page is being
+		 * faulted in, but we have no vaddr here.
+		 */
+		struct page *page;
+		int avoid_reserve = 1;
+
+		cond_resched();
+
+		/*
+		 * fallocate(2) manpage permits EINTR; we may have been
+		 * interrupted because we are using up too much memory.
+		 */
+		if (signal_pending(current)) {
+			error = -EINTR;
+			break;
+		}
+		page = find_get_page(mapping, index);
+		if (page) {
+			put_page(page);
+			continue;
+		}
+
+		/* Get policy based on index */
+		pseudo_vma.vm_policy =
+			mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+							index);
+
+		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+		mpol_cond_put(pseudo_vma.vm_policy);
+		if (IS_ERR(page)) {
+			error = PTR_ERR(page);
+			goto out;
+		}
+		clear_huge_page(page, addr, pages_per_huge_page(h));
+		__SetPageUptodate(page);
+		error = huge_add_to_page_cache(page, mapping, index);
+		if (error) {
+			put_page(page);
+			/* Keep going if we see an -EEXIST */
+			if (error != -EEXIST)
+				goto out;  /* FIXME, need to free? */
+		}
+
+		/*
+		 * page_put due to reference from alloc_huge_page()
+		 * unlock_page because locked by add_to_page_cache()
+		 */
+		put_page(page);
+		unlock_page(page);
+
+		/* Increment addr for next huge page */
+		addr += hpage_size;
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+		i_size_write(inode, offset + len);
+	inode->i_ctime = CURRENT_TIME;
+	spin_lock(&inode->i_lock);
+	inode->i_private = NULL;
+	spin_unlock(&inode->i_lock);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
 	struct resv_map *resv_map;
@@ -743,6 +881,7 @@ const struct file_operations hugetlbfs_file_operations = {
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
 	.llseek		= default_llseek,
+	.fallocate		= hugetlbfs_fallocate,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6425945..d96b88e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -322,6 +322,8 @@ struct huge_bootmem_page {
 #endif
 };
 
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+				unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
@@ -476,6 +478,7 @@ static inline bool hugepages_supported(void)
 
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
 #define alloc_huge_page_node(h, nid) NULL
 #define alloc_huge_page_noerr(v, a, r) NULL
 #define alloc_bootmem_huge_page(h) NULL
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7cda328..e130c6d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1363,7 +1363,7 @@ static void vma_commit_reservation(struct hstate *h,
 	region_add(resv, idx, idx + 1);
 }
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
 	struct hugepage_subpool *spool = subpool_vma(vma);
-- 
2.1.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-04-17 17:14 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <00fc01d078e3$63428ec0$29c7ac40$@alibaba-inc.com>
2015-04-17  8:00 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Hillf Danton
2015-04-17 17:14   ` Mike Kravetz
2015-04-16 23:02 [RFC PATCH 0/4] hugetlbfs: add fallocate support Mike Kravetz
2015-04-16 23:02 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Mike Kravetz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).