* Re: [PATCHv9-rebased2 28/37] shmem: get_unmapped_area align huge page

From: Hillf Danton @ 2016-06-17 8:06 UTC (permalink / raw)
To: Hugh Dickins, Kirill A. Shutemov; +Cc: linux-kernel, linux-mm

> +unsigned long shmem_get_unmapped_area(struct file *file,
> +				      unsigned long uaddr, unsigned long len,
> +				      unsigned long pgoff, unsigned long flags)
> +{
> +	unsigned long (*get_area)(struct file *,
> +		unsigned long, unsigned long, unsigned long, unsigned long);
> +	unsigned long addr;
> +	unsigned long offset;
> +	unsigned long inflated_len;
> +	unsigned long inflated_addr;
> +	unsigned long inflated_offset;
> +
> +	if (len > TASK_SIZE)
> +		return -ENOMEM;
> +
> +	get_area = current->mm->get_unmapped_area;
> +	addr = get_area(file, uaddr, len, pgoff, flags);
> +
> +	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
> +		return addr;
> +	if (IS_ERR_VALUE(addr))
> +		return addr;
> +	if (addr & ~PAGE_MASK)
> +		return addr;
> +	if (addr > TASK_SIZE - len)
> +		return addr;
> +
> +	if (shmem_huge == SHMEM_HUGE_DENY)
> +		return addr;
> +	if (len < HPAGE_PMD_SIZE)
> +		return addr;
> +	if (flags & MAP_FIXED)
> +		return addr;
> +	/*
> +	 * Our priority is to support MAP_SHARED mapped hugely;
> +	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
> +	 * But if caller specified an address hint, respect that as before.
> +	 */
> +	if (uaddr)
> +		return addr;
> +
> +	if (shmem_huge != SHMEM_HUGE_FORCE) {
> +		struct super_block *sb;
> +
> +		if (file) {
> +			VM_BUG_ON(file->f_op != &shmem_file_operations);
> +			sb = file_inode(file)->i_sb;
> +		} else {
> +			/*
> +			 * Called directly from mm/mmap.c, or drivers/char/mem.c
> +			 * for "/dev/zero", to create a shared anonymous object.
> +			 */
> +			if (IS_ERR(shm_mnt))
> +				return addr;
> +			sb = shm_mnt->mnt_sb;
> +		}
> +		if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
> +			return addr;

Try to ask for a larger arena if huge page is not disabled for
the mount (s/!=/==/)?

> +	}
> +
> +	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
> +	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
> +		return addr;
> +	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
> +		return addr;
> +
> +	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
> +	if (inflated_len > TASK_SIZE)
> +		return addr;
> +	if (inflated_len < len)
> +		return addr;
> +
> +	inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
> +	if (IS_ERR_VALUE(inflated_addr))
> +		return addr;
> +	if (inflated_addr & ~PAGE_MASK)
> +		return addr;
> +
> +	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
> +	inflated_addr += offset - inflated_offset;
> +	if (inflated_offset > offset)
> +		inflated_addr += HPAGE_PMD_SIZE;
> +
> +	if (inflated_addr > TASK_SIZE - len)
> +		return addr;
> +	return inflated_addr;
> +}
> +
* Re: [PATCHv9-rebased2 28/37] shmem: get_unmapped_area align huge page

From: Kirill A. Shutemov @ 2016-06-17 11:29 UTC (permalink / raw)
To: Hillf Danton; +Cc: Hugh Dickins, Kirill A. Shutemov, linux-kernel, linux-mm

On Fri, Jun 17, 2016 at 04:06:33PM +0800, Hillf Danton wrote:
> [...]
> > +	if (shmem_huge != SHMEM_HUGE_FORCE) {
> > +		struct super_block *sb;
> > +
> > +		if (file) {
> > +			VM_BUG_ON(file->f_op != &shmem_file_operations);
> > +			sb = file_inode(file)->i_sb;
> > +		} else {
> > +			/*
> > +			 * Called directly from mm/mmap.c, or drivers/char/mem.c
> > +			 * for "/dev/zero", to create a shared anonymous object.
> > +			 */
> > +			if (IS_ERR(shm_mnt))
> > +				return addr;
> > +			sb = shm_mnt->mnt_sb;
> > +		}
> > +		if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
> > +			return addr;
>
> Try to ask for a larger arena if huge page is not disabled for
> the mount (s/!=/==/)?

<facepalm>

I mostly test with SHMEM_HUGE_FORCE as it puts more stress on the system.

Fixup:

diff --git a/mm/shmem.c b/mm/shmem.c
index e2c6b6e8387a..3f4ebe84ef61 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1979,7 +1979,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 				return addr;
 			sb = shm_mnt->mnt_sb;
 		}
-		if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
+		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
 			return addr;
 	}

--
 Kirill A. Shutemov
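To restate what the one-character fixup changes: the buggy gate bailed out
whenever the mount had huge pages *enabled*, the corrected gate bails out
only when the mount has them *disabled*. A standalone model of the two
conditions -- a sketch, not kernel code; the enum mirrors the shmem huge=
settings in spirit only:

	#include <stdio.h>

	enum huge_setting { HUGE_NEVER, HUGE_ALWAYS };

	/* Returns 1 if shmem_get_unmapped_area() would go on to request
	 * the larger, PMD-alignable arena for a mount with this setting. */
	static int tries_alignment(enum huge_setting mount_huge, int fixed)
	{
		if (fixed)
			return mount_huge != HUGE_NEVER;	/* corrected: == in the bail-out */
		else
			return mount_huge == HUGE_NEVER;	/* buggy: != in the bail-out */
	}

	int main(void)
	{
		printf("buggy: huge=always -> %d, huge=never -> %d\n",
		       tries_alignment(HUGE_ALWAYS, 0), tries_alignment(HUGE_NEVER, 0));
		printf("fixed: huge=always -> %d, huge=never -> %d\n",
		       tries_alignment(HUGE_ALWAYS, 1), tries_alignment(HUGE_NEVER, 1));
		return 0;
	}

Running this prints that the buggy version tried huge alignment only for
huge=never mounts -- exactly backwards -- which is why testing mostly with
SHMEM_HUGE_FORCE (which skips the mount check entirely) hid the bug.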
* [PATCHv9 00/32] THP-enabled tmpfs/shmem using compound pages

From: Kirill A. Shutemov @ 2016-06-06 14:06 UTC (permalink / raw)
To: Hugh Dickins, Andrea Arcangeli, Andrew Morton
Cc: Dave Hansen, Vlastimil Babka, Christoph Lameter, Naoya Horiguchi,
    Jerome Marchand, Yang Shi, Sasha Levin, Andres Lagar-Cavilla,
    Ning Qu, linux-kernel, linux-mm, linux-fsdevel, Kirill A. Shutemov

This is a rebased version of my implementation of huge pages support for
tmpfs. There are a few fixes by Hugh since v8.

Rebasing on v4.7-rc1 was somewhat painful because of changes in the
radix-tree API, but everything looks fine now.

Andrew, please consider applying the patchset to the -mm tree. The
patchset is on top of v4.7-rc1 plus the khugepaged updates from the -mm
tree.

Git tree:

    git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git hugetmpfs/v9

== Changelog ==

v9:
  - rebased to v4.7-rc1;
  - truncate_inode_pages_range() and invalidate_inode_pages2_range() are
    adjusted to use page_to_pgoff() (Hugh);
  - filemap: fix refcounting in error path in radix-tree operations (Hugh);
  - khugepaged: handle !PageUptodate() pages (due to fallocate()?) during
    collapse (Hugh);
  - shmem_unused_huge_shrink:
      - fix shrinklist_len accounting (Hugh);
      - call find_lock_page() for the aligned address, so we will not get
        a tail page and crash in PageTransHuge() (Hugh);
v8:
  - khugepaged updates:
      + mark the collapsed page dirty, otherwise vmscan would discard it;
      + account pages to mapping->nrpages on shmem_charge;
      + fix a situation when not all tail pages are put on the radix tree
        on collapse;
      + fix off-by-one in loop-exit condition in khugepaged_scan_shmem();
      + use radix_tree_iter_next()/radix_tree_iter_retry() instead of gotos;
      + fix build without CONFIG_SHMEM (again);
  - split huge pages beyond i_size under memory pressure;
  - disable huge tmpfs on Power, as it makes use of deposited page tables,
    which we don't have;
  - fix filesystem size limit accounting;
  - mark the page referenced on split_huge_pmd() if the pmd is young;
  - uncharge pages from shmem that are removed during split_huge_page();
  - make shmem_inode_info::lock irq-safe -- required by khugepaged;
v7:
  - khugepaged updates:
      + fix page leak/page cache corruption on collapse fail;
      + filter out VMAs not suitable for huge pages due to misaligned
        vm_pgoff;
      + fix build without CONFIG_SHMEM;
      + drop a few over-protective checks;
  - fix bogus VM_BUG_ON() in __delete_from_page_cache();
v6:
  - experimental collapse support;
  - fix swapout of mapped huge pages;
  - fix page leak in faultaround code;
  - fix excessive huge page allocation with huge=within_size;
  - rename VM_NO_THP to VM_NO_KHUGEPAGED;
  - fix condition in hugepage_madvise();
  - accounting reworked again;
v5:
  - add FileHugeMapped to /proc/PID/smaps;
  - make FileHugeMapped in meminfo aligned with other fields;
  - Documentation/vm/transhuge.txt updated;
v4:
  - first four patches were applied to the -mm tree;
  - drop pages beyond i_size on split_huge_pages;
  - a few small random bugfixes;
v3:
  - the huge= mount option can now have the values always, within_size,
    advise and never;
  - the sysctl handle is replaced with a sysfs knob;
  - MADV_HUGEPAGE/MADV_NOHUGEPAGE is now respected on page allocation via
    page fault;
  - mlock() handling has been fixed;
  - a bunch of smaller bugfixes and cleanups.

== Design overview ==

Huge pages are allocated by shmem when that is allowed (by mount option)
and there are no entries for the range in the radix-tree.
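(For concreteness, requesting such a mount from userspace could look like
the sketch below; the mount point and size= value are invented for
illustration, and the call needs CAP_SYS_ADMIN. The shell equivalent
would be "mount -t tmpfs -o huge=always,size=1G tmpfs /mnt/huge-tmpfs".)

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* "huge=always" asks this tmpfs instance to back files
		 * with huge pages whenever possible; per the changelog,
		 * within_size, advise and never select the other policies. */
		if (mount("tmpfs", "/mnt/huge-tmpfs", "tmpfs", 0,
			  "huge=always,size=1G") != 0) {
			perror("mount tmpfs");
			return 1;
		}
		return 0;
	}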
A huge page is represented by HPAGE_PMD_NR entries in the radix-tree.

MM core maps a page with a PMD if ->fault() returns a huge page and the
VMA is suitable for huge pages (size, alignment). There's no need for
two requests to the filesystem: the filesystem returns a huge page if it
can, with graceful fallback to small pages otherwise.

As with DAX, split_huge_pmd() is implemented by unmapping the PMD: we
can re-fault the page with PTEs later.

The basic scheme for split_huge_page() is the same as for anon-THP. A
few differences:

  - File pages are on the radix-tree, so we have head->_count offset by
    HPAGE_PMD_NR. The count gets distributed to the small pages during
    split.

  - mapping->tree_lock prevents non-lockless access to pages under split
    over the radix-tree;

  - Lockless access is prevented by setting head->_count to 0 during
    split, so get_page_unless_zero() would fail;

  - After split, some pages can be beyond i_size. We drop them from the
    radix-tree.

  - We don't set up migration entries; we just unmap the pages. That
    helps to handle the case when i_size lies in the middle of the page:
    there is no need to unmap pages beyond i_size manually.

COW mappings are handled on the PTE level. It's not clear how beneficial
allocating huge pages on COW faults would be, and it would require some
code to make it work. I think at some point we can consider teaching
khugepaged to collapse pages in COW mappings, but allocating huge pages
on fault is probably overkill.

As with anon-THP, we mlock a file huge page only if it is mapped with a
PMD. PTE-mapped THPs are never mlocked. This way we avoid all sorts of
scenarios in which we could leak an mlocked page.

As with anon-THP, we split huge pages on swap out.

Truncate and punch-hole operations that cover only part of a THP range
are implemented by zeroing out that part of the THP. This has a visible
effect on fallocate(FALLOC_FL_PUNCH_HOLE) behaviour: as we don't really
create a hole in this case, lseek(SEEK_HOLE) may return inconsistent
results depending on which pages happened to be allocated. I don't think
this will be a problem (a demonstration sketch follows the patchset
overview below).

We track a per-super_block list of inodes which potentially have huge
pages partly beyond i_size. Under memory pressure, or if we hit -ENOSPC,
we split such pages in order to recover memory. The list is per-sb, as
we need to split a page from our own filesystem if we hit -ENOSPC (the
-o size= limit) during shmem_getpage_gfp(), to free some space.

== Patchset overview ==

[01/29] Update documentation on THP vs. mlock. I've posted it separately
        before; it can go in.

[02-04/29] Rework the fault path and rmap to handle file pmds. Unlike
        DAX with vm_ops->pmd_fault, we don't need to ask the filesystem
        twice -- first for a huge page and then for a small one. If
        ->fault() happened to return a huge page and the VMA is suitable
        for mapping it as huge, we do so.

[05/29] Add support for huge file pages in rmap.

[06-15/29] Various preparation of the THP core for file pages.

[16-20/29] Various preparation of the MM core for file pages.

[21-24/29] And finally, bring huge pages into tmpfs/shmem.

[25/29] Wire up existing madvise() hints for file THP. We can implement
        fadvise() later.

[26/29] Documentation update.

[27-29/29] Extend khugepaged to support shmem/tmpfs.
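To make the lseek(SEEK_HOLE) caveat concrete, here is a hedged userspace
sketch. It assumes a tmpfs with huge pages enabled mounted at a made-up
path (/mnt/huge-tmpfs) and a 2MB PMD size; on a kernel without this
series the punched range would simply become a real hole:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		off_t off, hole;
		int fd = open("/mnt/huge-tmpfs/f",
			      O_RDWR | O_CREAT | O_TRUNC, 0600);

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Populate 4MB (two PMD-sized pages) so pages are really
		 * allocated rather than the file being sparse. */
		memset(buf, 0xaa, sizeof(buf));
		for (off = 0; off < 4 << 20; off += sizeof(buf))
			pwrite(fd, buf, sizeof(buf), off);

		/* Punch 1MB in the middle of the first huge page: per the
		 * design above, shmem zeroes that part of the THP instead
		 * of freeing it, so no hole is created. */
		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			  1 << 20, 1 << 20);

		/* May report a hole at 1MB, or none before EOF, depending
		 * on whether a huge page happened to back the range. */
		hole = lseek(fd, 0, SEEK_HOLE);
		printf("first hole at %lld\n", (long long)hole);

		close(fd);
		return 0;
	}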
Hugh Dickins (1):
  shmem: get_unmapped_area align huge page

Kirill A. Shutemov (31):
  thp, mlock: update unevictable-lru.txt
  mm: do not pass mm_struct into handle_mm_fault
  mm: introduce fault_env
  mm: postpone page table allocation until we have page to map
  rmap: support file thp
  mm: introduce do_set_pmd()
  thp, vmstats: add counters for huge file pages
  thp: support file pages in zap_huge_pmd()
  thp: handle file pages in split_huge_pmd()
  thp: handle file COW faults
  thp: skip file huge pmd on copy_huge_pmd()
  thp: prepare change_huge_pmd() for file thp
  thp: run vma_adjust_trans_huge() outside i_mmap_rwsem
  thp: file pages support for split_huge_page()
  thp, mlock: do not mlock PTE-mapped file huge pages
  vmscan: split file huge pages before paging them out
  page-flags: relax policy for PG_mappedtodisk and PG_reclaim
  radix-tree: implement radix_tree_maybe_preload_order()
  filemap: prepare find and delete operations for huge pages
  truncate: handle file thp
  mm, rmap: account shmem thp pages
  shmem: prepare huge= mount option and sysfs knob
  shmem: add huge pages support
  shmem, thp: respect MADV_{NO,}HUGEPAGE for file mappings
  thp: extract khugepaged from mm/huge_memory.c
  khugepaged: move up_read(mmap_sem) out of khugepaged_alloc_page()
  shmem: make shmem_inode_info::lock irq-safe
  khugepaged: add support of collapse for tmpfs/shmem pages
  thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE
  shmem: split huge pages beyond i_size under memory pressure
  thp: update Documentation/{vm/transhuge,filesystems/proc}.txt

 Documentation/filesystems/Locking | 10 +-
 Documentation/filesystems/proc.txt | 9 +
 Documentation/vm/transhuge.txt | 128 ++-
 Documentation/vm/unevictable-lru.txt | 21 +
 arch/alpha/mm/fault.c | 2 +-
 arch/arc/mm/fault.c | 2 +-
 arch/arm/mm/fault.c | 2 +-
 arch/arm64/mm/fault.c | 2 +-
 arch/avr32/mm/fault.c | 2 +-
 arch/cris/mm/fault.c | 2 +-
 arch/frv/mm/fault.c | 2 +-
 arch/hexagon/mm/vm_fault.c | 2 +-
 arch/ia64/mm/fault.c | 2 +-
 arch/m32r/mm/fault.c | 2 +-
 arch/m68k/mm/fault.c | 2 +-
 arch/metag/mm/fault.c | 2 +-
 arch/microblaze/mm/fault.c | 2 +-
 arch/mips/mm/fault.c | 2 +-
 arch/mn10300/mm/fault.c | 2 +-
 arch/nios2/mm/fault.c | 2 +-
 arch/openrisc/mm/fault.c | 2 +-
 arch/parisc/mm/fault.c | 2 +-
 arch/powerpc/mm/copro_fault.c | 2 +-
 arch/powerpc/mm/fault.c | 2 +-
 arch/s390/mm/fault.c | 2 +-
 arch/score/mm/fault.c | 2 +-
 arch/sh/mm/fault.c | 2 +-
 arch/sparc/mm/fault_32.c | 4 +-
 arch/sparc/mm/fault_64.c | 2 +-
 arch/tile/mm/fault.c | 2 +-
 arch/um/kernel/trap.c | 2 +-
 arch/unicore32/mm/fault.c | 2 +-
 arch/x86/mm/fault.c | 2 +-
 arch/xtensa/mm/fault.c | 2 +-
 drivers/base/node.c | 13 +-
 drivers/char/mem.c | 24 +
 drivers/iommu/amd_iommu_v2.c | 3 +-
 drivers/iommu/intel-svm.c | 2 +-
 fs/proc/meminfo.c | 7 +-
 fs/proc/task_mmu.c | 10 +-
 fs/userfaultfd.c | 22 +-
 include/linux/huge_mm.h | 36 +-
 include/linux/khugepaged.h | 6 +
 include/linux/mm.h | 51 +-
 include/linux/mmzone.h | 4 +-
 include/linux/page-flags.h | 19 +-
 include/linux/radix-tree.h | 1 +
 include/linux/rmap.h | 2 +-
 include/linux/shmem_fs.h | 45 +-
 include/linux/userfaultfd_k.h | 8 +-
 include/linux/vm_event_item.h | 7 +
 include/trace/events/huge_memory.h | 3 +-
 ipc/shm.c | 10 +-
 lib/radix-tree.c | 84 +-
 mm/Kconfig | 8 +
 mm/Makefile | 2 +-
 mm/filemap.c | 217 ++--
 mm/gup.c | 7 +-
 mm/huge_memory.c | 2102 ++++++----------------------
 mm/internal.h | 4 +-
 mm/khugepaged.c | 1911 +++++++++++++++++++++++++++
 mm/ksm.c | 5 +-
 mm/memory.c | 879 +++++++-------
 mm/mempolicy.c | 4 +-
 mm/migrate.c | 5 +-
 mm/mmap.c | 26 +-
 mm/nommu.c | 3 +-
 mm/page-writeback.c | 1 +
 mm/page_alloc.c | 21 +
 mm/rmap.c | 78 +-
 mm/shmem.c | 918 +++++++++++++--
 mm/swap.c | 2 +
 mm/truncate.c | 28 +-
 mm/util.c | 6 +
 mm/vmscan.c | 6 +
 mm/vmstat.c | 4 +
 76 files changed, 4333 insertions(+), 2491 deletions(-)
 create mode 100644 mm/khugepaged.c

--
2.8.1
* [PATCHv9-rebased2 00/37] THP-enabled tmpfs/shmem using compound pages

From: Kirill A. Shutemov @ 2016-06-15 20:06 UTC (permalink / raw)
To: Hugh Dickins, Andrea Arcangeli, Andrew Morton
Cc: Dave Hansen, Vlastimil Babka, Christoph Lameter, Naoya Horiguchi,
    Jerome Marchand, Yang Shi, Sasha Levin, Andres Lagar-Cavilla,
    Ning Qu, linux-kernel, linux-mm, linux-fsdevel, Ebru Akagunduz,
    Kirill A. Shutemov

Andrew,

As requested, here's the refreshed version of the patchset.

During preparation, Ebru mentioned (on IRC) that she wanted to withdraw
mm-thp-avoid-unnecessary-swapin-in-khugepaged.patch from the -mm tree,
but that is difficult in the current state of the tree, so I rebased
with the patch removed.

The patchset below is aimed to replace the patches in your series,
starting with mm-vmstat-calculate-particular-vm-event.patch (it's not
necessary after the removal of
mm-thp-avoid-unnecessary-swapin-in-khugepaged.patch) up to the end of
my patchset.

I also took the opportunity to address Vlastimil's concern about 'pmd'
re-validation after the mmap_sem drop (you mentioned it in the series
file). See patch 05/37.

I ran a few sanity checks and everything looks good. Hopefully I didn't
screw up anything on the way. :)

Andrew Morton (1):
  mm-thp-make-swapin-readahead-under-down_read-of-mmap_sem-fix-2-fix

Ebru Akagunduz (2):
  mm, thp: make swapin readahead under down_read of mmap_sem
  mm, thp: fix locking inconsistency in collapse_huge_page

Hugh Dickins (1):
  shmem: get_unmapped_area align huge page

Kirill A. Shutemov (33):
  mm-thp-make-swapin-readahead-under-down_read-of-mmap_sem-fix
  khugepaged: recheck pmd after mmap_sem re-acquired
  thp, mlock: update unevictable-lru.txt
  mm: do not pass mm_struct into handle_mm_fault
  mm: introduce fault_env
  mm: postpone page table allocation until we have page to map
  rmap: support file thp
  mm: introduce do_set_pmd()
  thp, vmstats: add counters for huge file pages
  thp: support file pages in zap_huge_pmd()
  thp: handle file pages in split_huge_pmd()
  thp: handle file COW faults
  thp: skip file huge pmd on copy_huge_pmd()
  thp: prepare change_huge_pmd() for file thp
  thp: run vma_adjust_trans_huge() outside i_mmap_rwsem
  thp: file pages support for split_huge_page()
  thp, mlock: do not mlock PTE-mapped file huge pages
  vmscan: split file huge pages before paging them out
  page-flags: relax policy for PG_mappedtodisk and PG_reclaim
  radix-tree: implement radix_tree_maybe_preload_order()
  filemap: prepare find and delete operations for huge pages
  truncate: handle file thp
  mm, rmap: account shmem thp pages
  shmem: prepare huge= mount option and sysfs knob
  shmem: add huge pages support
  shmem, thp: respect MADV_{NO,}HUGEPAGE for file mappings
  thp: extract khugepaged from mm/huge_memory.c
  khugepaged: move up_read(mmap_sem) out of khugepaged_alloc_page()
  shmem: make shmem_inode_info::lock irq-safe
  khugepaged: add support of collapse for tmpfs/shmem pages
  thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE
  shmem: split huge pages beyond i_size under memory pressure
  thp: update Documentation/{vm/transhuge,filesystems/proc}.txt

 Documentation/filesystems/Locking | 10 +-
 Documentation/filesystems/proc.txt | 9 +
 Documentation/vm/transhuge.txt | 128 ++-
 Documentation/vm/unevictable-lru.txt | 21 +
 arch/alpha/mm/fault.c | 2 +-
 arch/arc/mm/fault.c | 2 +-
 arch/arm/mm/fault.c | 2 +-
 arch/arm64/mm/fault.c | 2 +-
 arch/avr32/mm/fault.c | 2 +-
 arch/cris/mm/fault.c | 2 +-
 arch/frv/mm/fault.c | 2 +-
 arch/hexagon/mm/vm_fault.c | 2 +-
 arch/ia64/mm/fault.c | 2 +-
 arch/m32r/mm/fault.c | 2 +-
 arch/m68k/mm/fault.c | 2 +-
 arch/metag/mm/fault.c | 2 +-
 arch/microblaze/mm/fault.c | 2 +-
 arch/mips/mm/fault.c | 2 +-
 arch/mn10300/mm/fault.c | 2 +-
 arch/nios2/mm/fault.c | 2 +-
 arch/openrisc/mm/fault.c | 2 +-
 arch/parisc/mm/fault.c | 2 +-
 arch/powerpc/mm/copro_fault.c | 2 +-
 arch/powerpc/mm/fault.c | 2 +-
 arch/s390/mm/fault.c | 2 +-
 arch/score/mm/fault.c | 2 +-
 arch/sh/mm/fault.c | 2 +-
 arch/sparc/mm/fault_32.c | 4 +-
 arch/sparc/mm/fault_64.c | 2 +-
 arch/tile/mm/fault.c | 2 +-
 arch/um/kernel/trap.c | 2 +-
 arch/unicore32/mm/fault.c | 2 +-
 arch/x86/mm/fault.c | 2 +-
 arch/xtensa/mm/fault.c | 2 +-
 drivers/base/node.c | 13 +-
 drivers/char/mem.c | 24 +
 drivers/iommu/amd_iommu_v2.c | 3 +-
 drivers/iommu/intel-svm.c | 2 +-
 fs/proc/meminfo.c | 7 +-
 fs/proc/task_mmu.c | 10 +-
 fs/userfaultfd.c | 22 +-
 include/linux/huge_mm.h | 36 +-
 include/linux/khugepaged.h | 5 +
 include/linux/mm.h | 51 +-
 include/linux/mmzone.h | 4 +-
 include/linux/page-flags.h | 19 +-
 include/linux/radix-tree.h | 1 +
 include/linux/rmap.h | 2 +-
 include/linux/shmem_fs.h | 45 +-
 include/linux/userfaultfd_k.h | 8 +-
 include/linux/vm_event_item.h | 7 +
 include/trace/events/huge_memory.h | 3 +-
 ipc/shm.c | 10 +-
 lib/radix-tree.c | 84 +-
 mm/Kconfig | 8 +
 mm/Makefile | 2 +-
 mm/filemap.c | 217 ++--
 mm/gup.c | 7 +-
 mm/huge_memory.c | 2048 ++++----------------------
 mm/internal.h | 4 +-
 mm/khugepaged.c | 1913 +++++++++++++++++++++++++++
 mm/ksm.c | 5 +-
 mm/memory.c | 860 +++++++-------
 mm/mempolicy.c | 2 +-
 mm/migrate.c | 5 +-
 mm/mmap.c | 26 +-
 mm/nommu.c | 3 +-
 mm/page-writeback.c | 1 +
 mm/page_alloc.c | 21 +
 mm/rmap.c | 78 +-
 mm/shmem.c | 918 +++++++++++++--
 mm/swap.c | 2 +
 mm/truncate.c | 28 +-
 mm/util.c | 6 +
 mm/vmscan.c | 6 +
 mm/vmstat.c | 4 +
 76 files changed, 4319 insertions(+), 2431 deletions(-)
 create mode 100644 mm/khugepaged.c

--
2.8.1
* [PATCHv9-rebased2 28/37] shmem: get_unmapped_area align huge page

From: Kirill A. Shutemov @ 2016-06-15 20:06 UTC (permalink / raw)
To: Hugh Dickins, Andrea Arcangeli, Andrew Morton
Cc: Dave Hansen, Vlastimil Babka, Christoph Lameter, Naoya Horiguchi,
    Jerome Marchand, Yang Shi, Sasha Levin, Andres Lagar-Cavilla,
    Ning Qu, linux-kernel, linux-mm, linux-fsdevel, Ebru Akagunduz,
    Kirill A. Shutemov

From: Hugh Dickins <hughd@google.com>

Provide a shmem_get_unmapped_area method in file_operations, called at
mmap time to decide the mapping address. It could be conditional on
CONFIG_TRANSPARENT_HUGEPAGE, but save #ifdefs in other places by making
it unconditional.

shmem_get_unmapped_area() first calls the usual mm->get_unmapped_area
(which we treat as a black box, highly dependent on architecture and
config and executable layout). Lots of conditions, and in most cases it
just goes with the address that call chose; but when our huge stars are
rightly aligned, yet that call did not provide a suitable address, it
goes back to ask for a larger arena, within which to align the mapping
suitably.

There have to be some direct calls to shmem_get_unmapped_area(), not via
the file_operations: because of the way shmem_zero_setup() is called to
create a shmem object late in the mmap sequence, when MAP_SHARED is
requested with MAP_ANONYMOUS or /dev/zero. Though this only matters when
/proc/sys/vm/shmem_huge has been set.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 drivers/char/mem.c       | 24 ++++++++++++
 include/linux/shmem_fs.h |  2 +
 ipc/shm.c                |  6 ++-
 mm/mmap.c                | 16 +++++++-
 mm/shmem.c               | 98 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 142 insertions(+), 4 deletions(-)

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 71025c2f6bbb..9656f1095c19 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/highmem.h>
 #include <linux/backing-dev.h>
+#include <linux/shmem_fs.h>
 #include <linux/splice.h>
 #include <linux/pfn.h>
 #include <linux/export.h>
@@ -661,6 +662,28 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }

+static unsigned long get_unmapped_area_zero(struct file *file,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags)
+{
+#ifdef CONFIG_MMU
+	if (flags & MAP_SHARED) {
+		/*
+		 * mmap_zero() will call shmem_zero_setup() to create a file,
+		 * so use shmem's get_unmapped_area in case it can be huge;
+		 * and pass NULL for file as in mmap.c's get_unmapped_area(),
+		 * so as not to confuse shmem with our handle on "/dev/zero".
+		 */
+		return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags);
+	}
+
+	/* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+#else
+	return -ENOSYS;
+#endif
+}
+
 static ssize_t write_full(struct file *file, const char __user *buf,
 			  size_t count, loff_t *ppos)
 {
@@ -768,6 +791,7 @@ static const struct file_operations zero_fops = {
 	.read_iter	= read_iter_zero,
 	.write_iter	= write_iter_zero,
 	.mmap		= mmap_zero,
+	.get_unmapped_area = get_unmapped_area_zero,
 #ifndef CONFIG_MMU
 	.mmap_capabilities = zero_mmap_capabilities,
 #endif
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 466f18c73a49..ff2de4bab61f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -50,6 +50,8 @@ extern struct file *shmem_file_setup(const char *name,
 extern struct file *shmem_kernel_file_setup(const char *name,
 					    loff_t size, unsigned long flags);
 extern int shmem_zero_setup(struct vm_area_struct *);
+extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags);
 extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 extern bool shmem_mapping(struct address_space *mapping);
 extern void shmem_unlock_mapping(struct address_space *mapping);
diff --git a/ipc/shm.c b/ipc/shm.c
index 13282510bc0d..7fa5cbebbf19 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -476,13 +476,15 @@ static const struct file_operations shm_file_operations = {
 	.mmap		= shm_mmap,
 	.fsync		= shm_fsync,
 	.release	= shm_release,
-#ifndef CONFIG_MMU
 	.get_unmapped_area	= shm_get_unmapped_area,
-#endif
 	.llseek		= noop_llseek,
 	.fallocate	= shm_fallocate,
 };

+/*
+ * shm_file_operations_huge is now identical to shm_file_operations,
+ * but we keep it distinct for the sake of is_file_shm_hugepages().
+ */
 static const struct file_operations shm_file_operations_huge = {
 	.mmap		= shm_mmap,
 	.fsync		= shm_fsync,
diff --git a/mm/mmap.c b/mm/mmap.c
index daabef097c78..25c2b4e0fbdc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -25,6 +25,7 @@
 #include <linux/personality.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
 #include <linux/profile.h>
 #include <linux/export.h>
 #include <linux/mount.h>
@@ -1897,8 +1898,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		return -ENOMEM;

 	get_area = current->mm->get_unmapped_area;
-	if (file && file->f_op->get_unmapped_area)
-		get_area = file->f_op->get_unmapped_area;
+	if (file) {
+		if (file->f_op->get_unmapped_area)
+			get_area = file->f_op->get_unmapped_area;
+	} else if (flags & MAP_SHARED) {
+		/*
+		 * mmap_region() will call shmem_zero_setup() to create a file,
+		 * so use shmem's get_unmapped_area in case it can be huge.
+		 * do_mmap_pgoff() will clear pgoff, so match alignment.
+		 */
+		pgoff = 0;
+		get_area = shmem_get_unmapped_area;
+	}
+
 	addr = get_area(file, addr, len, pgoff, flags);
 	if (IS_ERR_VALUE(addr))
 		return addr;
diff --git a/mm/shmem.c b/mm/shmem.c
index 2051e0685a43..f092d9aa129d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1513,6 +1513,94 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return ret;
 }

+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long uaddr, unsigned long len,
+				      unsigned long pgoff, unsigned long flags)
+{
+	unsigned long (*get_area)(struct file *,
+		unsigned long, unsigned long, unsigned long, unsigned long);
+	unsigned long addr;
+	unsigned long offset;
+	unsigned long inflated_len;
+	unsigned long inflated_addr;
+	unsigned long inflated_offset;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	get_area = current->mm->get_unmapped_area;
+	addr = get_area(file, uaddr, len, pgoff, flags);
+
+	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+		return addr;
+	if (IS_ERR_VALUE(addr))
+		return addr;
+	if (addr & ~PAGE_MASK)
+		return addr;
+	if (addr > TASK_SIZE - len)
+		return addr;
+
+	if (shmem_huge == SHMEM_HUGE_DENY)
+		return addr;
+	if (len < HPAGE_PMD_SIZE)
+		return addr;
+	if (flags & MAP_FIXED)
+		return addr;
+	/*
+	 * Our priority is to support MAP_SHARED mapped hugely;
+	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
+	 * But if caller specified an address hint, respect that as before.
+	 */
+	if (uaddr)
+		return addr;
+
+	if (shmem_huge != SHMEM_HUGE_FORCE) {
+		struct super_block *sb;
+
+		if (file) {
+			VM_BUG_ON(file->f_op != &shmem_file_operations);
+			sb = file_inode(file)->i_sb;
+		} else {
+			/*
+			 * Called directly from mm/mmap.c, or drivers/char/mem.c
+			 * for "/dev/zero", to create a shared anonymous object.
+			 */
+			if (IS_ERR(shm_mnt))
+				return addr;
+			sb = shm_mnt->mnt_sb;
+		}
+		if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
+			return addr;
+	}
+
+	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
+	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
+		return addr;
+	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
+		return addr;
+
+	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
+	if (inflated_len > TASK_SIZE)
+		return addr;
+	if (inflated_len < len)
+		return addr;
+
+	inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+	if (IS_ERR_VALUE(inflated_addr))
+		return addr;
+	if (inflated_addr & ~PAGE_MASK)
+		return addr;
+
+	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
+	inflated_addr += offset - inflated_offset;
+	if (inflated_offset > offset)
+		inflated_addr += HPAGE_PMD_SIZE;
+
+	if (inflated_addr > TASK_SIZE - len)
+		return addr;
+	return inflated_addr;
+}
+
 #ifdef CONFIG_NUMA
 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
@@ -3259,6 +3347,7 @@ static const struct address_space_operations shmem_aops = {

 static const struct file_operations shmem_file_operations = {
 	.mmap		= shmem_mmap,
+	.get_unmapped_area = shmem_get_unmapped_area,
 #ifdef CONFIG_TMPFS
 	.llseek		= shmem_file_llseek,
 	.read_iter	= shmem_file_read_iter,
@@ -3494,6 +3583,15 @@ void shmem_unlock_mapping(struct address_space *mapping)
 {
 }

+#ifdef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr, unsigned long len,
+				      unsigned long pgoff, unsigned long flags)
+{
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
+
 void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
 	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
--
2.8.1
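As a closing usage sketch of what the patch provides: on a kernel with
this series, with huge pages enabled for the mount (or with
/proc/sys/vm/shmem_huge forced), and assuming x86-64's 2MB
HPAGE_PMD_SIZE, a large MAP_SHARED anonymous mapping should come back
PMD-aligned, because an mmap with a NULL hint now routes through
shmem_get_unmapped_area(), which inflates the request by
HPAGE_PMD_SIZE - PAGE_SIZE and rounds up within the larger arena:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		const unsigned long hpage_size = 2UL << 20; /* assumed PMD size */
		size_t len = 16UL << 20;	/* >= HPAGE_PMD_SIZE, so eligible */

		/* MAP_SHARED|MAP_ANONYMOUS is backed by a shmem object, so
		 * the address comes from shmem_get_unmapped_area(). */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		printf("addr %p: %s to a PMD boundary\n", p,
		       ((unsigned long)p & (hpage_size - 1)) ?
				"NOT aligned" : "aligned");

		munmap(p, len);
		return 0;
	}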