From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Hugh Dickins <hughd@google.com>,
Andrea Arcangeli <aarcange@redhat.com>,
Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Christoph Lameter <cl@gentwo.org>,
Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>,
Jerome Marchand <jmarchan@redhat.com>,
Yang Shi <yang.shi@linaro.org>,
Sasha Levin <sasha.levin@oracle.com>,
Andres Lagar-Cavilla <andreslc@google.com>,
Ning Qu <quning@gmail.com>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-fsdevel@vger.kernel.org,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv8 19/32] filemap: prepare find and delete operations for huge pages
Date: Thu, 12 May 2016 18:40:59 +0300 [thread overview]
Message-ID: <1463067672-134698-20-git-send-email-kirill.shutemov@linux.intel.com> (raw)
In-Reply-To: <1463067672-134698-1-git-send-email-kirill.shutemov@linux.intel.com>
For now, we would have HPAGE_PMD_NR entries in the radix tree for every huge
page. That's suboptimal and it will be changed to use Matthew's multi-order
entries later.

The 'add' operation is not changed, because we don't need it to implement
huge tmpfs: shmem uses its own implementation.
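As an illustration only (not part of the patch): a minimal userspace sketch,
assuming the usual x86-64 constants (HPAGE_PMD_ORDER == 9,
RADIX_TREE_MAP_SHIFT == 6), of how many consecutive radix-tree slots one huge
page occupies and how often a walk over them crosses a node boundary, which is
the same condition the new page_cache_tree_delete() uses to decide when to
re-do the node lookup.

#include <stdio.h>

/* Assumed x86-64 values; both are config-dependent in the kernel. */
#define HPAGE_PMD_ORDER		9
#define HPAGE_PMD_NR		(1UL << HPAGE_PMD_ORDER)	/* 512 subpages */
#define RADIX_TREE_MAP_SHIFT	6
#define RADIX_TREE_MAP_MASK	((1UL << RADIX_TREE_MAP_SHIFT) - 1)

int main(void)
{
	unsigned long index = 1024;	/* huge page index, naturally aligned */
	unsigned long i, relookups = 0;

	for (i = 0; i < HPAGE_PMD_NR; i++) {
		/* same test as the patch: did index + i cross a node border? */
		if (i && ((index + i) & RADIX_TREE_MAP_MASK) == 0)
			relookups++;
	}
	printf("slots per huge page: %lu, node borders crossed: %lu\n",
	       HPAGE_PMD_NR, relookups);
	return 0;
}

With these values the walk touches 512 slots and crosses 7 node borders; a
single multi-order entry would avoid the per-subpage walk entirely.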
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
mm/filemap.c | 187 ++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 134 insertions(+), 53 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7e982835d4ec..bf29ab4f87dc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,43 +110,18 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
-static void page_cache_tree_delete(struct address_space *mapping,
- struct page *page, void *shadow)
+static void __page_cache_tree_delete(struct address_space *mapping,
+ struct radix_tree_node *node, void **slot, unsigned long index,
+ void *shadow)
{
- struct radix_tree_node *node;
- unsigned long index;
- unsigned int offset;
unsigned int tag;
- void **slot;
-
- VM_BUG_ON(!PageLocked(page));
- __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
-
- if (shadow) {
- mapping->nrexceptional++;
- /*
- * Make sure the nrexceptional update is committed before
- * the nrpages update so that final truncate racing
- * with reclaim does not see both counters 0 at the
- * same time and miss a shadow entry.
- */
- smp_wmb();
- }
- mapping->nrpages--;
-
- if (!node) {
- /* Clear direct pointer tags in root node */
- mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
- radix_tree_replace_slot(slot, shadow);
- return;
- }
+ VM_BUG_ON(node == NULL);
+ VM_BUG_ON(*slot == NULL);
/* Clear tree tags for the removed page */
- index = page->index;
- offset = index & RADIX_TREE_MAP_MASK;
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
- if (test_bit(offset, node->tags[tag]))
+ if (test_bit(index & RADIX_TREE_MAP_MASK, node->tags[tag]))
radix_tree_tag_clear(&mapping->page_tree, index, tag);
}
@@ -173,6 +148,54 @@ static void page_cache_tree_delete(struct address_space *mapping,
}
}
+static void page_cache_tree_delete(struct address_space *mapping,
+ struct page *page, void *shadow)
+{
+ struct radix_tree_node *node;
+ unsigned long index;
+ void **slot;
+ int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+ if (shadow) {
+ mapping->nrexceptional += nr;
+ /*
+ * Make sure the nrexceptional update is committed before
+ * the nrpages update so that final truncate racing
+ * with reclaim does not see both counters 0 at the
+ * same time and miss a shadow entry.
+ */
+ smp_wmb();
+ }
+ mapping->nrpages -= nr;
+
+ if (!node) {
+ /* Clear direct pointer tags in root node */
+ mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+ VM_BUG_ON(nr != 1);
+ radix_tree_replace_slot(slot, shadow);
+ return;
+ }
+
+ index = page->index;
+ VM_BUG_ON_PAGE(index & (nr - 1), page);
+ for (i = 0; i < nr; i++) {
+ /* Cross node border */
+ if (i && ((index + i) & RADIX_TREE_MAP_MASK) == 0) {
+ __radix_tree_lookup(&mapping->page_tree,
+ page->index + i, &node, &slot);
+ }
+
+ __page_cache_tree_delete(mapping, node,
+ slot + (i & RADIX_TREE_MAP_MASK), index + i,
+ shadow);
+ }
+}
+
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
@@ -181,6 +204,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
+ int nr = hpage_nr_pages(page);
trace_mm_filemap_delete_from_page_cache(page);
/*
@@ -193,6 +217,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
else
cleancache_invalidate_page(mapping, page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
int mapcount;
@@ -224,9 +249,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(page))
- __dec_zone_page_state(page, NR_FILE_PAGES);
+ __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page))
- __dec_zone_page_state(page, NR_SHMEM);
+ __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr);
/*
* At this point page must be either written or cleaned by truncate.
@@ -250,9 +275,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
*/
void delete_from_page_cache(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
unsigned long flags;
-
void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
@@ -265,7 +289,13 @@ void delete_from_page_cache(struct page *page)
if (freepage)
freepage(page);
- put_page(page);
+
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page_ref_sub(page, HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+ } else {
+ put_page(page);
+ }
}
EXPORT_SYMBOL(delete_from_page_cache);
@@ -1054,7 +1084,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
void **pagep;
- struct page *page;
+ struct page *head, *page;
rcu_read_lock();
repeat:
@@ -1074,9 +1104,17 @@ repeat:
*/
goto out;
}
- if (!page_cache_get_speculative(page))
+
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
+ goto repeat;
+ }
+
/*
* Has the page moved?
* This is part of the lockless pagecache protocol. See
@@ -1119,12 +1157,12 @@ repeat:
if (page && !radix_tree_exception(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(page_mapping(page) != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
}
return page;
}
@@ -1256,7 +1294,7 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1273,8 +1311,16 @@ repeat:
*/
goto export;
}
- if (!page_cache_get_speculative(page))
+
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
+ goto repeat;
+
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
goto repeat;
+ }
/* Has the page moved? */
if (unlikely(page != *slot)) {
@@ -1319,7 +1365,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1338,9 +1384,16 @@ repeat:
continue;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
put_page(page);
@@ -1380,7 +1433,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
/* The hole, there no reason to continue */
@@ -1400,8 +1453,14 @@ repeat:
break;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
+ goto repeat;
+ }
/* Has the page moved? */
if (unlikely(page != *slot)) {
@@ -1414,7 +1473,7 @@ repeat:
* otherwise we can get both false positives and false
* negatives, which is just confusing to the caller.
*/
- if (page->mapping == NULL || page->index != iter.index) {
+ if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
put_page(page);
break;
}
@@ -1452,7 +1511,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1477,8 +1536,15 @@ repeat:
continue;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
+ goto repeat;
+
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
goto repeat;
+ }
/* Has the page moved? */
if (unlikely(page != *slot)) {
@@ -1526,7 +1592,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
- struct page *page;
+ struct page *head, *page;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1544,9 +1610,17 @@ repeat:
*/
goto export;
}
- if (!page_cache_get_speculative(page))
+
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
goto repeat;
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
+ goto repeat;
+ }
+
/* Has the page moved? */
if (unlikely(page != *slot)) {
put_page(page);
@@ -2140,7 +2214,7 @@ void filemap_map_pages(struct fault_env *fe,
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
- struct page *page;
+ struct page *head, *page;
rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
@@ -2159,8 +2233,15 @@ repeat:
goto next;
}
- if (!page_cache_get_speculative(page))
+ head = compound_head(page);
+ if (!page_cache_get_speculative(head))
+ goto repeat;
+
+ /* The page was split under us? */
+ if (compound_head(page) != head) {
+ put_page(page);
goto repeat;
+ }
/* Has the page moved? */
if (unlikely(page != *slot)) {
--
2.8.1