From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Andrea Arcangeli <aarcange@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>,
	Hugh Dickins <hughd@google.com>,
	Wu Fengguang <fengguang.wu@intel.com>, Jan Kara <jack@suse.cz>,
	Mel Gorman <mgorman@suse.de>,
	linux-mm@kvack.org, Andi Kleen <ak@linux.intel.com>,
	Matthew Wilcox <matthew.r.wilcox@intel.com>,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	Hillf Danton <dhillf@gmail.com>, Dave Hansen <dave@sr71.net>,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv3, RFC 31/34] thp: initial implementation of do_huge_linear_fault()
Date: Fri,  5 Apr 2013 14:59:55 +0300
Message-ID: <1365163198-29726-32-git-send-email-kirill.shutemov@linux.intel.com>
In-Reply-To: <1365163198-29726-1-git-send-email-kirill.shutemov@linux.intel.com>

From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

The function tries to create a new page mapping using huge pages. It is
only called for pages which are not yet mapped.

As usual with THP, we fall back to small pages if we fail to allocate a
huge page.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
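Note (not part of the patch): a minimal sketch of what a filesystem is
assumed to provide to reach this path. ->huge_fault() is the callback
added earlier in this series ("mm: add huge_fault() callback to
vm_operations_struct") and is called below with the same (vma, vmf)
convention as ->fault(); the example_* handler names are hypothetical.
On fallback, do_fallback() lets the regular small-page path handle the
fault.

static const struct vm_operations_struct example_file_vm_ops = {
	/* regular small-page fault path */
	.fault		= filemap_fault,
	/*
	 * Hypothetical huge fault handler: expected to return the compound
	 * file page in vmf->page (locked or not, signalled via
	 * VM_FAULT_LOCKED), or the usual VM_FAULT_* error/retry codes,
	 * following the ->fault() conventions.
	 */
	.huge_fault	= example_huge_fault,
	/* needed for shared writable mappings, as with small pages */
	.page_mkwrite	= example_page_mkwrite,
};
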
 include/linux/huge_mm.h |    3 +
 mm/huge_memory.c        |  196 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 199 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b53e295..aa52c48 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -5,6 +5,9 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
 				      struct vm_area_struct *vma,
 				      unsigned long address, pmd_t *pmd,
 				      unsigned int flags);
+extern int do_huge_linear_fault(struct mm_struct *mm,
+		struct vm_area_struct *vma, unsigned long address, pmd_t *pmd,
+		unsigned int flags);
 extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 			 struct vm_area_struct *vma);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c1d5f2b..ed4389b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
+#include <linux/writeback.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -864,6 +865,201 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 0;
 }
 
+int do_huge_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	struct page *cow_page, *page, *dirty_page = NULL;
+	bool anon = false, fallback = false, page_mkwrite = false;
+	pgtable_t pgtable = NULL;
+	struct vm_fault vmf;
+	int ret;
+
+	/*
+	 * Fall back if vm_start and vm_pgoff are not equally aligned
+	 * within a huge page.
+	 */
+	if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
+			(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
+		return do_fallback(mm, vma, address, pmd, flags);
+
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return do_fallback(mm, vma, address, pmd, flags);
+
+	if (unlikely(khugepaged_enter(vma)))
+		return VM_FAULT_OOM;
+
+	/*
+	 * If we are going to COW the content later, allocate the huge page
+	 * before taking lock_page() on the file cache page, to reduce lock
+	 * hold time.
+	 */
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+		if (unlikely(anon_vma_prepare(vma)))
+			return VM_FAULT_OOM;
+
+		cow_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+				vma, haddr, numa_node_id(), 0);
+		if (!cow_page) {
+			count_vm_event(THP_FAULT_FALLBACK);
+			return do_fallback(mm, vma, address, pmd, flags);
+		}
+		count_vm_event(THP_FAULT_ALLOC);
+		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+			page_cache_release(cow_page);
+			return do_fallback(mm, vma, address, pmd, flags);
+		}
+	} else
+		cow_page = NULL;
+
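+	/*
+	 * Pre-allocate a page table to deposit below; it will be consumed
+	 * if the huge pmd is split later.
+	 */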
+	pgtable = pte_alloc_one(mm, haddr);
+	if (unlikely(!pgtable)) {
+		ret = VM_FAULT_OOM;
+		goto uncharge_out;
+	}
+
+	vmf.virtual_address = (void __user *)haddr;
+	vmf.pgoff = ((haddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+	vmf.flags = flags;
+	vmf.page = NULL;
+
+	ret = vma->vm_ops->huge_fault(vma, &vmf);
+	if (unlikely(ret & VM_FAULT_OOM))
+		goto uncharge_out_fallback;
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		goto uncharge_out;
+
+	if (unlikely(PageHWPoison(vmf.page))) {
+		if (ret & VM_FAULT_LOCKED)
+			unlock_page(vmf.page);
+		ret = VM_FAULT_HWPOISON;
+		goto uncharge_out;
+	}
+
+	/*
+	 * Make sure the faulted page is locked from here on, regardless of
+	 * whether ->huge_fault() returned it locked (VM_FAULT_LOCKED).
+	 */
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf.page);
+	else
+		VM_BUG_ON(!PageLocked(vmf.page));
+
+	/*
+	 * Should we do an early C-O-W break?
+	 */
+	page = vmf.page;
+	if (flags & FAULT_FLAG_WRITE) {
+		if (!(vma->vm_flags & VM_SHARED)) {
+			page = cow_page;
+			anon = true;
+			copy_user_huge_page(page, vmf.page, haddr, vma,
+					HPAGE_PMD_NR);
+			__SetPageUptodate(page);
+		} else if (vma->vm_ops->page_mkwrite) {
+			int tmp;
+
+			unlock_page(page);
+			vmf.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE;
+			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
+			if (unlikely(tmp &
+				  (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+				ret = tmp;
+				goto unwritable_page;
+			}
+			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+				lock_page(page);
+				if (!page->mapping) {
+					ret = 0; /* retry the fault */
+					unlock_page(page);
+					goto unwritable_page;
+				}
+			} else
+				VM_BUG_ON(!PageLocked(page));
+			page_mkwrite = true;
+		}
+	}
+
+	VM_BUG_ON(!PageCompound(page));
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_none(*pmd))) {
+		pmd_t entry;
+
+		flush_icache_page(vma, page);
+		entry = mk_huge_pmd(page, vma->vm_page_prot);
+		if (flags & FAULT_FLAG_WRITE)
+			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+		if (anon) {
+			add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+			page_add_new_anon_rmap(page, vma, haddr);
+		} else {
+			add_mm_counter(mm, MM_FILEPAGES, HPAGE_PMD_NR);
+			page_add_file_rmap(page);
+			if (flags & FAULT_FLAG_WRITE) {
+				dirty_page = page;
+				get_page(dirty_page);
+			}
+		}
+		set_pmd_at(mm, haddr, pmd, entry);
+		pgtable_trans_huge_deposit(mm, pgtable);
+		mm->nr_ptes++;
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache_pmd(vma, address, pmd);
+	} else {
+		/*
+		 * We lost the race to another fault: free the unused pgtable
+		 * and drop the references taken above.
+		 */
+		pte_free(mm, pgtable);
+		if (cow_page)
+			mem_cgroup_uncharge_page(cow_page);
+		if (anon)
+			page_cache_release(page);
+		else
+			anon = true; /* not anon, but release the faulted page below */
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (dirty_page) {
+		struct address_space *mapping = page->mapping;
+		bool dirtied = false;
+
+		if (set_page_dirty(dirty_page))
+			dirtied = true;
+		unlock_page(dirty_page);
+		put_page(dirty_page);
+		if ((dirtied || page_mkwrite) && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping but still
+			 * dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file && !page_mkwrite)
+			file_update_time(vma->vm_file);
+	} else {
+		unlock_page(vmf.page);
+		if (anon)
+			page_cache_release(vmf.page);
+	}
+
+	return ret;
+
+unwritable_page:
+	pte_free(mm, pgtable);
+	page_cache_release(page);
+	return ret;
+uncharge_out_fallback:
+	fallback = true;
+uncharge_out:
+	if (pgtable)
+		pte_free(mm, pgtable);
+	if (cow_page) {
+		mem_cgroup_uncharge_page(cow_page);
+		page_cache_release(cow_page);
+	}
+
+	if (fallback)
+		return do_fallback(mm, vma, address, pmd, flags);
+	else
+		return ret;
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *vma)
-- 
1.7.10.4


Thread overview: 55+ messages
2013-04-05 11:59 [PATCHv3, RFC 00/34] Transparent huge page cache Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 01/34] mm: drop actor argument of do_generic_file_read() Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 02/34] block: implement add_bdi_stat() Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 03/34] mm: implement zero_huge_user_segment and friends Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 04/34] radix-tree: implement preload for multiple contiguous elements Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 05/34] memcg, thp: charge huge cache pages Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 06/34] thp, mm: avoid PageUnevictable on active/inactive lru lists Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 07/34] thp, mm: basic defines for transparent huge page cache Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 08/34] thp, mm: introduce mapping_can_have_hugepages() predicate Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 09/34] thp: represent file thp pages in meminfo and friends Kirill A. Shutemov
2013-04-08 19:38   ` Dave Hansen
2013-04-16 14:49     ` Kirill A. Shutemov
2013-04-16 15:11       ` Dave Hansen
2013-04-05 11:59 ` [PATCHv3, RFC 10/34] thp, mm: rewrite add_to_page_cache_locked() to support huge pages Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 11/34] mm: trace filemap: dump page order Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 12/34] thp, mm: rewrite delete_from_page_cache() to support huge pages Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 13/34] thp, mm: trigger bug in replace_page_cache_page() on THP Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 14/34] thp, mm: locking tail page is a bug Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 15/34] thp, mm: handle tail pages in page_cache_get_speculative() Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 16/34] thp, mm: add event counters for huge page alloc on write to a file Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 17/34] thp, mm: implement grab_thp_write_begin() Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 18/34] thp, mm: naive support of thp in generic read/write routines Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 19/34] thp, libfs: initial support of thp in simple_read/write_begin/write_end Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 20/34] thp: handle file pages in split_huge_page() Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 21/34] thp: wait_split_huge_page(): serialize over i_mmap_mutex too Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 22/34] thp, mm: truncate support for transparent huge page cache Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 23/34] thp, mm: split huge page on mmap file page Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 24/34] ramfs: enable transparent huge page cache Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 25/34] x86-64, mm: proper alignment mappings with hugepages Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 26/34] mm: add huge_fault() callback to vm_operations_struct Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 27/34] thp: prepare zap_huge_pmd() to uncharge file pages Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 28/34] thp: move maybe_pmd_mkwrite() out of mk_huge_pmd() Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 29/34] thp, mm: basic huge_fault implementation for generic_file_vm_ops Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 30/34] thp: extract fallback path from do_huge_pmd_anonymous_page() to a function Kirill A. Shutemov
2013-04-05 11:59 ` Kirill A. Shutemov [this message]
2013-04-08 18:46   ` [PATCHv3, RFC 31/34] thp: initial implementation of do_huge_linear_fault() Dave Hansen
2013-04-08 18:52   ` Dave Hansen
2013-04-17 14:38     ` Kirill A. Shutemov
2013-04-17 22:07       ` Dave Hansen
2013-04-18 16:09         ` Kirill A. Shutemov
2013-04-18 16:19           ` Kirill A. Shutemov
2013-04-18 16:20           ` Dave Hansen
2013-04-18 16:38             ` Kirill A. Shutemov
2013-04-18 16:42               ` Dave Hansen
2013-04-05 11:59 ` [PATCHv3, RFC 32/34] thp: handle write-protect exception to file-backed huge pages Kirill A. Shutemov
2013-04-08 19:07   ` Dave Hansen
2013-04-26 15:31     ` Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 33/34] thp: call __vma_adjust_trans_huge() for file-backed VMA Kirill A. Shutemov
2013-04-05 11:59 ` [PATCHv3, RFC 34/34] thp: map file-backed huge pages on fault Kirill A. Shutemov
2013-04-07  0:40 ` [PATCHv3, RFC 00/34] Transparent huge page cache Ric Mason
2013-04-15 16:02 ` IOZone with transparent " Kirill A. Shutemov
2013-04-15 18:17 ` [RESEND] " Kirill A. Shutemov
2013-04-15 23:19   ` Dave Hansen
2013-04-16  5:57     ` Kirill A. Shutemov
2013-04-16  6:11       ` Dave Hansen
