linux-fsdevel.vger.kernel.org archive mirror
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Hugh Dickins <hughd@google.com>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	Christoph Lameter <cl@gentwo.org>,
	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
	Jerome Marchand <jmarchan@redhat.com>,
	Yang Shi <yang.shi@linaro.org>,
	Sasha Levin <sasha.levin@oracle.com>,
	Andres Lagar-Cavilla <andreslc@google.com>,
	Ning Qu <quning@gmail.com>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	linux-fsdevel@vger.kernel.org,
	"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv8 19/32] filemap: prepare find and delete operations for huge pages
Date: Thu, 12 May 2016 18:40:59 +0300
Message-ID: <1463067672-134698-20-git-send-email-kirill.shutemov@linux.intel.com>
In-Reply-To: <1463067672-134698-1-git-send-email-kirill.shutemov@linux.intel.com>

For now, we have HPAGE_PMD_NR entries in the radix tree for every huge
page: one slot per subpage. That's suboptimal, and it will be changed to
use Matthew's multi-order entries later.

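To illustrate the layout (this sketch is not part of the patch): a
PMD-sized page at offset 'index' ends up occupying HPAGE_PMD_NR
consecutive slots, each pointing at the corresponding subpage. The helper
name below is made up, and it assumes the caller holds mapping->tree_lock
and has preloaded the radix tree; the real insertion for huge tmpfs is
done by shmem later in the series:

static int sketch_add_huge_page(struct address_space *mapping,
				struct page *page, pgoff_t index)
{
	int i, err;

	/* One radix tree slot per subpage of the compound page */
	for (i = 0; i < HPAGE_PMD_NR; i++) {
		err = radix_tree_insert(&mapping->page_tree,
					index + i, page + i);
		if (err)
			return err;
	}
	return 0;
}
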
The 'add' operation is not changed, because we don't need it to implement
huge tmpfs: shmem uses its own implementation.
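
On the lookup side, each find_* helper below must now cope with slots
that point at tail pages of a compound page. Stripped of the RCU
iteration and exceptional-entry details, the recurring pattern is roughly
the sketch below (the helper name is made up; it assumes it is called
under rcu_read_lock()):

static struct page *sketch_deref_slot(void **slot)
{
	struct page *head, *page;

repeat:
	page = radix_tree_deref_slot(slot);
	if (!page || radix_tree_exception(page))
		return NULL;

	/* Pin the compound head, not the (possibly tail) subpage */
	head = compound_head(page);
	if (!page_cache_get_speculative(head))
		goto repeat;

	/* The page was split under us? Drop the pin and retry. */
	if (compound_head(page) != head) {
		put_page(head);
		goto repeat;
	}

	/* Has the page moved? */
	if (unlikely(page != *slot)) {
		put_page(head);
		goto repeat;
	}

	return page;
}

Pinning the head page keeps the whole compound page alive, and re-checking
compound_head() catches a split that raced with the lookup, in which case
the reference is dropped and the slot is re-read.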

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/filemap.c | 187 ++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 134 insertions(+), 53 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 7e982835d4ec..bf29ab4f87dc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,43 +110,18 @@
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
-static void page_cache_tree_delete(struct address_space *mapping,
-				   struct page *page, void *shadow)
+static void __page_cache_tree_delete(struct address_space *mapping,
+		struct radix_tree_node *node, void **slot, unsigned long index,
+		void *shadow)
 {
-	struct radix_tree_node *node;
-	unsigned long index;
-	unsigned int offset;
 	unsigned int tag;
-	void **slot;
-
-	VM_BUG_ON(!PageLocked(page));
 
-	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
-
-	if (shadow) {
-		mapping->nrexceptional++;
-		/*
-		 * Make sure the nrexceptional update is committed before
-		 * the nrpages update so that final truncate racing
-		 * with reclaim does not see both counters 0 at the
-		 * same time and miss a shadow entry.
-		 */
-		smp_wmb();
-	}
-	mapping->nrpages--;
-
-	if (!node) {
-		/* Clear direct pointer tags in root node */
-		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
-		radix_tree_replace_slot(slot, shadow);
-		return;
-	}
+	VM_BUG_ON(node == NULL);
+	VM_BUG_ON(*slot == NULL);
 
 	/* Clear tree tags for the removed page */
-	index = page->index;
-	offset = index & RADIX_TREE_MAP_MASK;
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (test_bit(offset, node->tags[tag]))
+		if (test_bit(index & RADIX_TREE_MAP_MASK, node->tags[tag]))
 			radix_tree_tag_clear(&mapping->page_tree, index, tag);
 	}
 
@@ -173,6 +148,54 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	}
 }
 
+static void page_cache_tree_delete(struct address_space *mapping,
+				   struct page *page, void *shadow)
+{
+	struct radix_tree_node *node;
+	unsigned long index;
+	void **slot;
+	int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(PageTail(page), page);
+
+	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+	if (shadow) {
+		mapping->nrexceptional += nr;
+		/*
+		 * Make sure the nrexceptional update is committed before
+		 * the nrpages update so that final truncate racing
+		 * with reclaim does not see both counters 0 at the
+		 * same time and miss a shadow entry.
+		 */
+		smp_wmb();
+	}
+	mapping->nrpages -= nr;
+
+	if (!node) {
+		/* Clear direct pointer tags in root node */
+		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		VM_BUG_ON(nr != 1);
+		radix_tree_replace_slot(slot, shadow);
+		return;
+	}
+
+	index = page->index;
+	VM_BUG_ON_PAGE(index & (nr - 1), page);
+	for (i = 0; i < nr; i++) {
+		/* Cross node border */
+		if (i && ((index + i) & RADIX_TREE_MAP_MASK) == 0) {
+			__radix_tree_lookup(&mapping->page_tree,
+					page->index + i, &node, &slot);
+		}
+
+		__page_cache_tree_delete(mapping, node,
+				slot + (i & RADIX_TREE_MAP_MASK), index + i,
+				shadow);
+	}
+}
+
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
@@ -181,6 +204,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
 void __delete_from_page_cache(struct page *page, void *shadow)
 {
 	struct address_space *mapping = page->mapping;
+	int nr = hpage_nr_pages(page);
 
 	trace_mm_filemap_delete_from_page_cache(page);
 	/*
@@ -193,6 +217,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 	else
 		cleancache_invalidate_page(mapping, page);
 
+	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(page_mapped(page), page);
 	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
 		int mapcount;
@@ -224,9 +249,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 
 	/* hugetlb pages do not participate in page cache accounting. */
 	if (!PageHuge(page))
-		__dec_zone_page_state(page, NR_FILE_PAGES);
+		__mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr);
 	if (PageSwapBacked(page))
-		__dec_zone_page_state(page, NR_SHMEM);
+		__mod_zone_page_state(page_zone(page), NR_SHMEM, -nr);
 
 	/*
 	 * At this point page must be either written or cleaned by truncate.
@@ -250,9 +275,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
  */
 void delete_from_page_cache(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	unsigned long flags;
-
 	void (*freepage)(struct page *);
 
 	BUG_ON(!PageLocked(page));
@@ -265,7 +289,13 @@ void delete_from_page_cache(struct page *page)
 
 	if (freepage)
 		freepage(page);
-	put_page(page);
+
+	if (PageTransHuge(page) && !PageHuge(page)) {
+		page_ref_sub(page, HPAGE_PMD_NR);
+		VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+	} else {
+		put_page(page);
+	}
 }
 EXPORT_SYMBOL(delete_from_page_cache);
 
@@ -1054,7 +1084,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 {
 	void **pagep;
-	struct page *page;
+	struct page *head, *page;
 
 	rcu_read_lock();
 repeat:
@@ -1074,9 +1104,17 @@ repeat:
 			 */
 			goto out;
 		}
-		if (!page_cache_get_speculative(page))
+
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
 			goto repeat;
 
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
+			goto repeat;
+		}
+
 		/*
 		 * Has the page moved?
 		 * This is part of the lockless pagecache protocol. See
@@ -1119,12 +1157,12 @@ repeat:
 	if (page && !radix_tree_exception(page)) {
 		lock_page(page);
 		/* Has the page been truncated? */
-		if (unlikely(page->mapping != mapping)) {
+		if (unlikely(page_mapping(page) != mapping)) {
 			unlock_page(page);
 			put_page(page);
 			goto repeat;
 		}
-		VM_BUG_ON_PAGE(page->index != offset, page);
+		VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
 	}
 	return page;
 }
@@ -1256,7 +1294,7 @@ unsigned find_get_entries(struct address_space *mapping,
 
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-		struct page *page;
+		struct page *head, *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1273,8 +1311,16 @@ repeat:
 			 */
 			goto export;
 		}
-		if (!page_cache_get_speculative(page))
+
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
+			goto repeat;
+
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
 			goto repeat;
+		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
@@ -1319,7 +1365,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-		struct page *page;
+		struct page *head, *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1338,9 +1384,16 @@ repeat:
 			continue;
 		}
 
-		if (!page_cache_get_speculative(page))
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
 			goto repeat;
 
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
+			goto repeat;
+		}
+
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
 			put_page(page);
@@ -1380,7 +1433,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 
 	rcu_read_lock();
 	radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
-		struct page *page;
+		struct page *head, *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		/* The hole, there no reason to continue */
@@ -1400,8 +1453,14 @@ repeat:
 			break;
 		}
 
-		if (!page_cache_get_speculative(page))
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
 			goto repeat;
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
+			goto repeat;
+		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
@@ -1414,7 +1473,7 @@ repeat:
 		 * otherwise we can get both false positives and false
 		 * negatives, which is just confusing to the caller.
 		 */
-		if (page->mapping == NULL || page->index != iter.index) {
+		if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
 			put_page(page);
 			break;
 		}
@@ -1452,7 +1511,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 	rcu_read_lock();
 	radix_tree_for_each_tagged(slot, &mapping->page_tree,
 				   &iter, *index, tag) {
-		struct page *page;
+		struct page *head, *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1477,8 +1536,15 @@ repeat:
 			continue;
 		}
 
-		if (!page_cache_get_speculative(page))
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
+			goto repeat;
+
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
 			goto repeat;
+		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
@@ -1526,7 +1592,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
 	rcu_read_lock();
 	radix_tree_for_each_tagged(slot, &mapping->page_tree,
 				   &iter, start, tag) {
-		struct page *page;
+		struct page *head, *page;
 repeat:
 		page = radix_tree_deref_slot(slot);
 		if (unlikely(!page))
@@ -1544,9 +1610,17 @@ repeat:
 			 */
 			goto export;
 		}
-		if (!page_cache_get_speculative(page))
+
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
 			goto repeat;
 
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
+			goto repeat;
+		}
+
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
 			put_page(page);
@@ -2140,7 +2214,7 @@ void filemap_map_pages(struct fault_env *fe,
 	struct address_space *mapping = file->f_mapping;
 	pgoff_t last_pgoff = start_pgoff;
 	loff_t size;
-	struct page *page;
+	struct page *head, *page;
 
 	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
@@ -2159,8 +2233,15 @@ repeat:
 			goto next;
 		}
 
-		if (!page_cache_get_speculative(page))
+		head = compound_head(page);
+		if (!page_cache_get_speculative(head))
+			goto repeat;
+
+		/* The page was split under us? */
+		if (compound_head(page) != head) {
+			put_page(page);
 			goto repeat;
+		}
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-- 
2.8.1


Thread overview: 43+ messages
2016-05-12 15:40 [PATCHv8 00/32] THP-enabled tmpfs/shmem using compound pages Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 01/32] thp, mlock: update unevictable-lru.txt Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 02/32] mm: do not pass mm_struct into handle_mm_fault Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 03/32] mm: introduce fault_env Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 04/32] mm: postpone page table allocation until we have page to map Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 05/32] rmap: support file thp Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 06/32] mm: introduce do_set_pmd() Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 07/32] thp, vmstats: add counters for huge file pages Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 08/32] thp: support file pages in zap_huge_pmd() Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 09/32] thp: handle file pages in split_huge_pmd() Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 10/32] thp: handle file COW faults Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 11/32] thp: skip file huge pmd on copy_huge_pmd() Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 12/32] thp: prepare change_huge_pmd() for file thp Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 13/32] thp: run vma_adjust_trans_huge() outside i_mmap_rwsem Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 14/32] thp: file pages support for split_huge_page() Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 15/32] thp, mlock: do not mlock PTE-mapped file huge pages Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 16/32] vmscan: split file huge pages before paging them out Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 17/32] page-flags: relax policy for PG_mappedtodisk and PG_reclaim Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 18/32] radix-tree: implement radix_tree_maybe_preload_order() Kirill A. Shutemov
2016-05-12 15:40 ` [PATCHv8 19/32] filemap: prepare find and delete operations for huge pages Kirill A. Shutemov [this message]
2016-05-12 15:41 ` [PATCHv8 20/32] truncate: handle file thp Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 21/32] mm, rmap: account shmem thp pages Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 22/32] shmem: prepare huge= mount option and sysfs knob Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 23/32] shmem: get_unmapped_area align huge page Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 24/32] shmem: add huge pages support Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 25/32] shmem, thp: respect MADV_{NO,}HUGEPAGE for file mappings Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 26/32] thp: update Documentation/vm/transhuge.txt Kirill A. Shutemov
2016-05-19 16:20   ` Julien Grall
2016-05-20 10:33     ` Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 27/32] thp: extract khugepaged from mm/huge_memory.c Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 28/32] khugepaged: move up_read(mmap_sem) out of khugepaged_alloc_page() Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 29/32] shmem: make shmem_inode_info::lock irq-safe Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 30/32] khugepaged: add support of collapse for tmpfs/shmem pages Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 31/32] thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE Kirill A. Shutemov
2016-05-12 15:41 ` [PATCHv8 32/32] shmem: split huge pages beyond i_size under memory pressure Kirill A. Shutemov
2016-05-25 19:11 ` [PATCHv8 00/32] THP-enabled tmpfs/shmem using compound pages neha agarwal
2016-05-25 20:03   ` Kirill A. Shutemov
2016-05-25 21:11     ` neha agarwal
2016-05-25 21:21       ` Kirill A. Shutemov
2016-05-27 16:28         ` neha agarwal
2016-06-06 13:51   ` Kirill A. Shutemov
2016-06-08 18:43     ` neha agarwal
2016-06-13  9:06       ` Kirill A. Shutemov
