* Re: [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() [not found] <00fc01d078e3$63428ec0$29c7ac40$@alibaba-inc.com> @ 2015-04-17 8:00 ` Hillf Danton 2015-04-17 17:14 ` Mike Kravetz 0 siblings, 1 reply; 3+ messages in thread From: Hillf Danton @ 2015-04-17 8:00 UTC (permalink / raw) To: Dave Hansen, Mike Kravetz; +Cc: linux-kernel, linux-mm > > This is based on the shmem version, but it has diverged quite > a bit. We have no swap to worry about, nor the new file sealing. > > What this allows us to do is move physical memory in and out of > a hugetlbfs file without having it mapped. This also gives us > the ability to support MADV_REMOVE since it is currently > implemented using fallocate(). MADV_REMOVE lets us remove data > from the middle of a hugetlbfs file, which wasn't possible before. > > hugetlbfs fallocate only operates on whole huge pages. > > Based-on code-by: Dave Hansen <dave.hansen@linux.intel.com> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> > --- > fs/hugetlbfs/inode.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/hugetlb.h | 3 ++ > mm/hugetlb.c | 2 +- > 3 files changed, 143 insertions(+), 1 deletion(-) > > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c > index d5b67fd..6d48c8f 100644 > --- a/fs/hugetlbfs/inode.c > +++ b/fs/hugetlbfs/inode.c > @@ -12,6 +12,7 @@ > #include <linux/thread_info.h> > #include <asm/current.h> > #include <linux/sched.h> /* remove ASAP */ > +#include <linux/falloc.h> > #include <linux/fs.h> > #include <linux/mount.h> > #include <linux/file.h> > @@ -377,6 +378,143 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart, loff_t lend) > hugetlb_unreserve_pages(inode, start, freed); > } > > +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) > +{ > + struct hstate *h = hstate_inode(inode); > + unsigned long hpage_size = huge_page_size(h); > + loff_t hole_start, hole_end; > + > + /* > + * For hole punch round up the beginning offset of the 
hole and > + * round down the end. > + */ > + hole_start = (offset + hpage_size - 1) & ~huge_page_mask(h); > + hole_end = (offset + len - (hpage_size - 1)) * ~huge_page_mask(h); > + > + if ((u64)hole_end > (u64)hole_start) { > + struct address_space *mapping = &inode->i_data; > + > + mutex_lock(&inode->i_mutex); > + unmap_mapping_range(mapping, hole_start, hole_end, 0); > + truncate_hugepages(inode, hole_start, hole_end); > + mutex_unlock(&inode->i_mutex); > + } > + > + return 0; > +} > + > +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, > + loff_t len) > +{ > + struct inode *inode = file_inode(file); > + struct address_space *mapping = inode->i_mapping; > + struct hstate *h = hstate_inode(inode); > + struct vm_area_struct pseudo_vma; > + unsigned long hpage_size = huge_page_size(h); > + unsigned long hpage_shift = huge_page_shift(h); > + pgoff_t start, index, end; > + unsigned long addr; > + int error; > + > + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) > + return -EOPNOTSUPP; > + > + if (mode & FALLOC_FL_PUNCH_HOLE) > + return hugetlbfs_punch_hole(inode, offset, len); > + > + /* > + * Default preallocate case. > + * For this range, start is rounded down and end is rounded up. > + */ > + start = offset >> hpage_shift; > + end = (offset + len + hpage_size - 1) >> hpage_shift; > + > + mutex_lock(&inode->i_mutex); > + > + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ > + error = inode_newsize_ok(inode, offset + len); > + if (error) > + goto out; > + > + /* > + * Initialize a pseudo vma that just contains the policy used > + * when allocating the huge pages. The actual policy field > + * (vm_policy) is determined based on the index in the loop below. 
> + */ > + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); > + pseudo_vma.vm_start = 0; > + pseudo_vma.vm_flags |= (VM_HUGETLB | VM_MAYSHARE); > + pseudo_vma.vm_file = file; > + > + /* addr is the offset within the file (zero based) */ > + addr = start * hpage_size; > + for (index = start; index < end; index++) { > + /* > + * This is supposed to be the vaddr where the page is being > + * faulted in, but we have no vaddr here. > + */ > + struct page *page; > + int avoid_reserve = 1; > + > + cond_resched(); > + > + /* > + * fallocate(2) manpage permits EINTR; we may have been > + * interrupted because we are using up too much memory. > + */ > + if (signal_pending(current)) { > + error = -EINTR; > + break; > + } > + page = find_get_page(mapping, index); > + if (page) { > + put_page(page); > + continue; > + } > + > + /* Get policy based on index */ > + pseudo_vma.vm_policy = > + mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, > + index); > + > + page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); > + mpol_cond_put(pseudo_vma.vm_policy); > + if (IS_ERR(page)) { > + error = PTR_ERR(page); > + goto out; > + } > + clear_huge_page(page, addr, pages_per_huge_page(h)); > + __SetPageUptodate(page); > + error = huge_add_to_page_cache(page, mapping, index); > + if (error) { > + put_page(page); > + /* Keep going if we see an -EEXIST */ > + if (error != -EEXIST) > + goto out; /* FIXME, need to free? */ > + } > + > + /* > + * page_put due to reference from alloc_huge_page() > + * unlock_page because locked by add_to_page_cache() > + */ > + put_page(page); Still needed if EEXIST? 
> + unlock_page(page); > + > + /* Increment addr for next huge page */ > + addr += hpage_size; > + } > + > + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) > + i_size_write(inode, offset + len); > + inode->i_ctime = CURRENT_TIME; > + spin_lock(&inode->i_lock); > + inode->i_private = NULL; > + spin_unlock(&inode->i_lock); > +out: > + mutex_unlock(&inode->i_mutex); > + return error; > +} > + > static void hugetlbfs_evict_inode(struct inode *inode) > { > struct resv_map *resv_map; > @@ -743,6 +881,7 @@ const struct file_operations hugetlbfs_file_operations = { > .fsync = noop_fsync, > .get_unmapped_area = hugetlb_get_unmapped_area, > .llseek = default_llseek, > + .fallocate = hugetlbfs_fallocate, > }; > > static const struct inode_operations hugetlbfs_dir_inode_operations = { > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 6425945..d96b88e 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -322,6 +322,8 @@ struct huge_bootmem_page { > #endif > }; > > +struct page *alloc_huge_page(struct vm_area_struct *vma, > + unsigned long addr, int avoid_reserve); > struct page *alloc_huge_page_node(struct hstate *h, int nid); > struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, > unsigned long addr, int avoid_reserve); > @@ -476,6 +478,7 @@ static inline bool hugepages_supported(void) > > #else /* CONFIG_HUGETLB_PAGE */ > struct hstate {}; > +#define alloc_huge_page(v, a, r) NULL > #define alloc_huge_page_node(h, nid) NULL > #define alloc_huge_page_noerr(v, a, r) NULL > #define alloc_bootmem_huge_page(h) NULL > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 7cda328..e130c6d 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -1363,7 +1363,7 @@ static void vma_commit_reservation(struct hstate *h, > region_add(resv, idx, idx + 1); > } > > -static struct page *alloc_huge_page(struct vm_area_struct *vma, > +struct page *alloc_huge_page(struct vm_area_struct *vma, > unsigned long addr, int 
avoid_reserve) > { > struct hugepage_subpool *spool = subpool_vma(vma); > -- > 2.1.0 > > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() 2015-04-17 8:00 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Hillf Danton @ 2015-04-17 17:14 ` Mike Kravetz 0 siblings, 0 replies; 3+ messages in thread From: Mike Kravetz @ 2015-04-17 17:14 UTC (permalink / raw) To: Hillf Danton, Dave Hansen; +Cc: linux-kernel, linux-mm On 04/17/2015 01:00 AM, Hillf Danton wrote: >> + clear_huge_page(page, addr, pages_per_huge_page(h)); >> + __SetPageUptodate(page); >> + error = huge_add_to_page_cache(page, mapping, index); >> + if (error) { >> + put_page(page); >> + /* Keep going if we see an -EEXIST */ >> + if (error != -EEXIST) >> + goto out; /* FIXME, need to free? */ >> + } >> + >> + /* >> + * page_put due to reference from alloc_huge_page() >> + * unlock_page because locked by add_to_page_cache() >> + */ >> + put_page(page); > > Still needed if EEXIST? Nope. Good catch. I'll fix this in the next version. -- Mike Kravetz -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 3+ messages in thread
* [RFC PATCH 0/4] hugetlbfs: add fallocate support @ 2015-04-16 23:02 Mike Kravetz 2015-04-16 23:02 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Mike Kravetz 0 siblings, 1 reply; 3+ messages in thread From: Mike Kravetz @ 2015-04-16 23:02 UTC (permalink / raw) To: linux-mm, linux-kernel Cc: Dave Hansen, Naoya Horiguchi, David Rientjes, Hugh Dickins, Davidlohr Bueso, Aneesh Kumar, Mike Kravetz hugetlbfs is used today by applications that want a high degree of control over huge page usage. Often, large hugetlbfs files are used to map a large number of huge pages into the application processes. The applications know when page ranges within these large files will no longer be used, and ideally would like to release them back to the subpool or global pools for other uses. The fallocate() system call provides an interface for preallocation and hole punching within files. This patch set adds fallocate functionality to hugetlbfs. Mike Kravetz (4): hugetlbfs: truncate_hugepages() takes a range of pages hugetlbfs: New huge_add_to_page_cache helper routine hugetlbfs: add hugetlbfs_fallocate() mm: madvise allow remove operation for hugetlbfs fs/hugetlbfs/inode.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/hugetlb.h | 5 ++ mm/hugetlb.c | 29 ++++++--- mm/madvise.c | 2 +- 4 files changed, 185 insertions(+), 15 deletions(-) -- 2.1.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 3+ messages in thread
* [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() 2015-04-16 23:02 [RFC PATCH 0/4] hugetlbfs: add fallocate support Mike Kravetz @ 2015-04-16 23:02 ` Mike Kravetz 0 siblings, 0 replies; 3+ messages in thread From: Mike Kravetz @ 2015-04-16 23:02 UTC (permalink / raw) To: linux-mm, linux-kernel Cc: Dave Hansen, Naoya Horiguchi, David Rientjes, Hugh Dickins, Davidlohr Bueso, Aneesh Kumar, Mike Kravetz This is based on the shmem version, but it has diverged quite a bit. We have no swap to worry about, nor the new file sealing. What this allows us to do is move physical memory in and out of a hugetlbfs file without having it mapped. This also gives us the ability to support MADV_REMOVE since it is currently implemented using fallocate(). MADV_REMOVE lets us remove data from the middle of a hugetlbfs file, which wasn't possible before. hugetlbfs fallocate only operates on whole huge pages. Based-on code-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> --- fs/hugetlbfs/inode.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/hugetlb.h | 3 ++ mm/hugetlb.c | 2 +- 3 files changed, 143 insertions(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d5b67fd..6d48c8f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -12,6 +12,7 @@ #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ +#include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> @@ -377,6 +378,143 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart, loff_t lend) hugetlb_unreserve_pages(inode, start, freed); } +static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct hstate *h = hstate_inode(inode); + unsigned long hpage_size = huge_page_size(h); + loff_t hole_start, hole_end; + + /* + * For hole punch round up the beginning offset of the hole and + * round down 
the end. + */ + hole_start = (offset + hpage_size - 1) & ~huge_page_mask(h); + hole_end = (offset + len - (hpage_size - 1)) * ~huge_page_mask(h); + + if ((u64)hole_end > (u64)hole_start) { + struct address_space *mapping = &inode->i_data; + + mutex_lock(&inode->i_mutex); + unmap_mapping_range(mapping, hole_start, hole_end, 0); + truncate_hugepages(inode, hole_start, hole_end); + mutex_unlock(&inode->i_mutex); + } + + return 0; +} + +static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); + struct vm_area_struct pseudo_vma; + unsigned long hpage_size = huge_page_size(h); + unsigned long hpage_shift = huge_page_shift(h); + pgoff_t start, index, end; + unsigned long addr; + int error; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return hugetlbfs_punch_hole(inode, offset, len); + + /* + * Default preallocate case. + * For this range, start is rounded down and end is rounded up. + */ + start = offset >> hpage_shift; + end = (offset + len + hpage_size - 1) >> hpage_shift; + + mutex_lock(&inode->i_mutex); + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + /* + * Initialize a pseudo vma that just contains the policy used + * when allocating the huge pages. The actual policy field + * (vm_policy) is determined based on the index in the loop below. 
+ */ + memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); + pseudo_vma.vm_start = 0; + pseudo_vma.vm_flags |= (VM_HUGETLB | VM_MAYSHARE); + pseudo_vma.vm_file = file; + + /* addr is the offset within the file (zero based) */ + addr = start * hpage_size; + for (index = start; index < end; index++) { + /* + * This is supposed to be the vaddr where the page is being + * faulted in, but we have no vaddr here. + */ + struct page *page; + int avoid_reserve = 1; + + cond_resched(); + + /* + * fallocate(2) manpage permits EINTR; we may have been + * interrupted because we are using up too much memory. + */ + if (signal_pending(current)) { + error = -EINTR; + break; + } + page = find_get_page(mapping, index); + if (page) { + put_page(page); + continue; + } + + /* Get policy based on index */ + pseudo_vma.vm_policy = + mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, + index); + + page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + mpol_cond_put(pseudo_vma.vm_policy); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto out; + } + clear_huge_page(page, addr, pages_per_huge_page(h)); + __SetPageUptodate(page); + error = huge_add_to_page_cache(page, mapping, index); + if (error) { + put_page(page); + /* Keep going if we see an -EEXIST */ + if (error != -EEXIST) + goto out; /* FIXME, need to free? 
*/ + } + + /* + * page_put due to reference from alloc_huge_page() + * unlock_page because locked by add_to_page_cache() + */ + put_page(page); + unlock_page(page); + + /* Increment addr for next huge page */ + addr += hpage_size; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); + inode->i_ctime = CURRENT_TIME; + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + mutex_unlock(&inode->i_mutex); + return error; +} + static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; @@ -743,6 +881,7 @@ const struct file_operations hugetlbfs_file_operations = { .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, + .fallocate = hugetlbfs_fallocate, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6425945..d96b88e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -322,6 +322,8 @@ struct huge_bootmem_page { #endif }; +struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); @@ -476,6 +478,7 @@ static inline bool hugepages_supported(void) #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +#define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7cda328..e130c6d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1363,7 +1363,7 @@ static void vma_commit_reservation(struct hstate *h, region_add(resv, idx, idx + 1); } -static struct page *alloc_huge_page(struct vm_area_struct *vma, +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned 
long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); -- 2.1.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2015-04-17 17:14 UTC | newest] Thread overview: 3+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- [not found] <00fc01d078e3$63428ec0$29c7ac40$@alibaba-inc.com> 2015-04-17 8:00 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Hillf Danton 2015-04-17 17:14 ` Mike Kravetz 2015-04-16 23:02 [RFC PATCH 0/4] hugetlbfs: add fallocate support Mike Kravetz 2015-04-16 23:02 ` [RFC PATCH 3/4] hugetlbfs: add hugetlbfs_fallocate() Mike Kravetz
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).